#[cfg(feature = "cuda")]
use cudarc::cusolver as cusolver_safe;
#[cfg(feature = "cuda")]
use cudarc::driver::{DevicePtr, DevicePtrMut};
use crate::buffer::CudaBuffer;
use crate::device::GpuDevice;
use crate::error::{GpuError, GpuResult};
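// Host-side helpers for converting between the crate's row-major layout and the
// column-major layout cuSOLVER expects. `transpose_f32(data, m, n)` treats `data`
// as a row-major m x n matrix and returns the same values laid out column-major
// (equivalently, the row-major transpose).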
fn transpose_f32(data: &[f32], m: usize, n: usize) -> Vec<f32> {
let mut out = vec![0.0f32; m * n];
for i in 0..m {
for j in 0..n {
out[j * m + i] = data[i * n + j];
}
}
out
}
fn transpose_f64(data: &[f64], m: usize, n: usize) -> Vec<f64> {
let mut out = vec![0.0f64; m * n];
for i in 0..m {
for j in 0..n {
out[j * m + i] = data[i * n + j];
}
}
out
}
#[cfg(feature = "cuda")]
fn read_dev_info(info_buf: &CudaBuffer<i32>, device: &GpuDevice) -> GpuResult<i32> {
let host = crate::transfer::gpu_to_cpu(info_buf, device)?;
Ok(host[0])
}
#[cfg(feature = "cuda")]
fn alloc_zeros_i32(len: usize, device: &GpuDevice) -> GpuResult<CudaBuffer<i32>> {
crate::transfer::alloc_zeros::<i32>(len, device)
}
#[cfg(feature = "cuda")]
fn upload_f32(data: &[f32], device: &GpuDevice) -> GpuResult<CudaBuffer<f32>> {
crate::transfer::cpu_to_gpu(data, device)
}
#[cfg(feature = "cuda")]
fn download_f32(buf: &CudaBuffer<f32>, device: &GpuDevice) -> GpuResult<Vec<f32>> {
crate::transfer::gpu_to_cpu(buf, device)
}
#[cfg(feature = "cuda")]
fn upload_f64(data: &[f64], device: &GpuDevice) -> GpuResult<CudaBuffer<f64>> {
crate::transfer::cpu_to_gpu(data, device)
}
#[cfg(feature = "cuda")]
fn download_f64(buf: &CudaBuffer<f64>, device: &GpuDevice) -> GpuResult<Vec<f64>> {
crate::transfer::gpu_to_cpu(buf, device)
}
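/// Singular value decomposition of a row-major `m x n` matrix via cuSOLVER `gesvd`.
///
/// Returns `(U, S, VT)` where `U` is row-major `m x k`, `S` has length `k`, and
/// `VT` is row-major `k x n`, with `k = min(m, n)`. The input is transposed to
/// column-major on the host, factored on the GPU, and the factors are transposed
/// back before returning. Note that cuSOLVER's `gesvd` only supports `m >= n`;
/// wide matrices (`m < n`) are not handled here and will surface as a cuSOLVER
/// error.
///
/// A minimal usage sketch (ignored doctest; assumes a CUDA device at index 0):
///
/// ```ignore
/// let device = GpuDevice::new(0).expect("CUDA device 0");
/// // 3 x 2 row-major matrix
/// let a = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
/// let (u, s, vt) = gpu_svd_f32(&a, 3, 2, &device).unwrap();
/// assert_eq!(s.len(), 2);
/// ```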
#[cfg(feature = "cuda")]
pub fn gpu_svd_f32(
data: &[f32],
m: usize,
n: usize,
device: &GpuDevice,
) -> GpuResult<(Vec<f32>, Vec<f32>, Vec<f32>)> {
use cudarc::cusolver::sys as csys;
if m == 0 || n == 0 {
return Ok((vec![], vec![], vec![]));
}
let k = m.min(n);
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let col_major = transpose_f32(data, m, n);
let mut d_a = upload_f32(&col_major, device)?;
let mut d_s = crate::transfer::alloc_zeros_f32(k, device)?;
let mut d_u = crate::transfer::alloc_zeros_f32(m * k, device)?;
let mut d_vt = crate::transfer::alloc_zeros_f32(k * n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
csys::cusolverDnSgesvd_bufferSize(dn.cu(), m as i32, n as i32, &mut lwork).result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (s_ptr, _s_sync) = d_s.inner_mut().device_ptr_mut(&stream);
let (u_ptr, _u_sync) = d_u.inner_mut().device_ptr_mut(&stream);
let (vt_ptr, _vt_sync) = d_vt.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgesvd(
dn.cu(),
b'S' as i8,
b'S' as i8,
m as i32,
n as i32,
a_ptr as *mut f32,
m as i32,
s_ptr as *mut f32,
u_ptr as *mut f32,
m as i32,
vt_ptr as *mut f32,
k as i32,
work_ptr as *mut f32,
lwork,
std::ptr::null_mut(),
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_svd_f32: gesvd failed",
expected: vec![0],
got: vec![info_val as usize],
});
}
let s_host = download_f32(&d_s, device)?;
let u_col = download_f32(&d_u, device)?;
let mut u_host = vec![0.0f32; m * k];
for i in 0..m {
for j in 0..k {
u_host[i * k + j] = u_col[j * m + i];
}
}
let vt_col = download_f32(&d_vt, device)?;
let mut vt_host = vec![0.0f32; k * n];
for i in 0..k {
for j in 0..n {
vt_host[i * n + j] = vt_col[j * k + i];
}
}
Ok((u_host, s_host, vt_host))
}
#[cfg(feature = "cuda")]
pub fn gpu_svd_f64(
data: &[f64],
m: usize,
n: usize,
device: &GpuDevice,
) -> GpuResult<(Vec<f64>, Vec<f64>, Vec<f64>)> {
use cudarc::cusolver::sys as csys;
if m == 0 || n == 0 {
return Ok((vec![], vec![], vec![]));
}
let k = m.min(n);
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let col_major = transpose_f64(data, m, n);
let mut d_a = upload_f64(&col_major, device)?;
let mut d_s = crate::transfer::alloc_zeros_f64(k, device)?;
let mut d_u = crate::transfer::alloc_zeros_f64(m * k, device)?;
let mut d_vt = crate::transfer::alloc_zeros_f64(k * n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
csys::cusolverDnDgesvd_bufferSize(dn.cu(), m as i32, n as i32, &mut lwork).result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (s_ptr, _s_sync) = d_s.inner_mut().device_ptr_mut(&stream);
let (u_ptr, _u_sync) = d_u.inner_mut().device_ptr_mut(&stream);
let (vt_ptr, _vt_sync) = d_vt.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgesvd(
dn.cu(),
b'S' as i8,
b'S' as i8,
m as i32,
n as i32,
a_ptr as *mut f64,
m as i32,
s_ptr as *mut f64,
u_ptr as *mut f64,
m as i32,
vt_ptr as *mut f64,
k as i32,
work_ptr as *mut f64,
lwork,
std::ptr::null_mut(),
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_svd_f64: gesvd failed",
expected: vec![0],
got: vec![info_val as usize],
});
}
let s_host = download_f64(&d_s, device)?;
let u_col = download_f64(&d_u, device)?;
let mut u_host = vec![0.0f64; m * k];
for i in 0..m {
for j in 0..k {
u_host[i * k + j] = u_col[j * m + i];
}
}
let vt_col = download_f64(&d_vt, device)?;
let mut vt_host = vec![0.0f64; k * n];
for i in 0..k {
for j in 0..n {
vt_host[i * n + j] = vt_col[j * k + i];
}
}
Ok((u_host, s_host, vt_host))
}
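/// Cholesky factorization of a row-major symmetric positive-definite `n x n`
/// matrix via cuSOLVER `potrf` (lower triangle).
///
/// Returns the lower-triangular factor `L` in row-major order with the strict
/// upper triangle explicitly zeroed, so `A = L * L^T`. Fails if `potrf` reports
/// that the matrix is not positive-definite.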
#[cfg(feature = "cuda")]
pub fn gpu_cholesky_f32(data: &[f32], n: usize, device: &GpuDevice) -> GpuResult<Vec<f32>> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return Ok(vec![]);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let col_major = transpose_f32(data, n, n);
let mut d_a = upload_f32(&col_major, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSpotrf_bufferSize(
dn.cu(),
csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
n as i32,
a_ptr as *mut f32,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSpotrf(
dn.cu(),
csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
n as i32,
a_ptr as *mut f32,
n as i32,
work_ptr as *mut f32,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_cholesky_f32: matrix is not positive-definite",
expected: vec![0],
got: vec![info_val as usize],
});
}
let l_col = download_f32(&d_a, device)?;
let mut l_host = vec![0.0f32; n * n];
for i in 0..n {
for j in 0..n {
l_host[i * n + j] = l_col[j * n + i];
}
}
for i in 0..n {
for j in (i + 1)..n {
l_host[i * n + j] = 0.0;
}
}
Ok(l_host)
}
#[cfg(feature = "cuda")]
pub fn gpu_cholesky_f64(data: &[f64], n: usize, device: &GpuDevice) -> GpuResult<Vec<f64>> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return Ok(vec![]);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let col_major = transpose_f64(data, n, n);
let mut d_a = upload_f64(&col_major, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDpotrf_bufferSize(
dn.cu(),
csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
n as i32,
a_ptr as *mut f64,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDpotrf(
dn.cu(),
csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
n as i32,
a_ptr as *mut f64,
n as i32,
work_ptr as *mut f64,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_cholesky_f64: matrix is not positive-definite",
expected: vec![0],
got: vec![info_val as usize],
});
}
let l_col = download_f64(&d_a, device)?;
let mut l_host = vec![0.0f64; n * n];
for i in 0..n {
for j in 0..n {
l_host[i * n + j] = l_col[j * n + i];
}
}
for i in 0..n {
for j in (i + 1)..n {
l_host[i * n + j] = 0.0;
}
}
Ok(l_host)
}
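/// Solves the dense linear system `A * X = B` for a row-major `n x n` matrix `A`
/// and a row-major `n x nrhs` right-hand side `B`, using LU factorization with
/// partial pivoting (`getrf` followed by `getrs`).
///
/// A minimal usage sketch (ignored doctest; assumes a CUDA device at index 0):
///
/// ```ignore
/// let device = GpuDevice::new(0).expect("CUDA device 0");
/// let a = vec![2.0f32, 1.0, 1.0, 3.0]; // 2 x 2, row-major
/// let b = vec![5.0f32, 10.0];          // 2 x 1
/// let x = gpu_solve_f32(&a, &b, 2, 1, &device).unwrap();
/// // x is approximately [1.0, 3.0]
/// ```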
#[cfg(feature = "cuda")]
pub fn gpu_solve_f32(
a_data: &[f32],
b_data: &[f32],
n: usize,
nrhs: usize,
device: &GpuDevice,
) -> GpuResult<Vec<f32>> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return Ok(vec![]);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let a_col = transpose_f32(a_data, n, n);
let mut d_a = upload_f32(&a_col, device)?;
let b_col = transpose_f32(b_data, n, nrhs);
let mut d_b = upload_f32(&b_col, device)?;
let mut d_ipiv = alloc_zeros_i32(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgetrf_bufferSize(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f32,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgetrf(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f32,
n as i32,
work_ptr as *mut f32,
ipiv_ptr as *mut i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f32: LU factorization failed (singular matrix)",
expected: vec![0],
got: vec![info_val as usize],
});
}
let mut d_info2 = alloc_zeros_i32(1, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner().device_ptr(&stream);
let (b_ptr, _b_sync) = d_b.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgetrs(
dn.cu(),
csys::cublasOperation_t::CUBLAS_OP_N,
n as i32,
nrhs as i32,
a_ptr as *const f32,
n as i32,
ipiv_ptr as *const i32,
b_ptr as *mut f32,
n as i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val2 = read_dev_info(&d_info2, device)?;
if info_val2 != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f32: triangular solve failed",
expected: vec![0],
got: vec![info_val2 as usize],
});
}
let x_col = download_f32(&d_b, device)?;
let mut x_host = vec![0.0f32; n * nrhs];
for i in 0..n {
for j in 0..nrhs {
x_host[i * nrhs + j] = x_col[j * n + i];
}
}
Ok(x_host)
}
#[cfg(feature = "cuda")]
pub fn gpu_solve_f64(
a_data: &[f64],
b_data: &[f64],
n: usize,
nrhs: usize,
device: &GpuDevice,
) -> GpuResult<Vec<f64>> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return Ok(vec![]);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let a_col = transpose_f64(a_data, n, n);
let mut d_a = upload_f64(&a_col, device)?;
let b_col = transpose_f64(b_data, n, nrhs);
let mut d_b = upload_f64(&b_col, device)?;
let mut d_ipiv = alloc_zeros_i32(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgetrf_bufferSize(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f64,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgetrf(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f64,
n as i32,
work_ptr as *mut f64,
ipiv_ptr as *mut i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f64: LU factorization failed (singular matrix)",
expected: vec![0],
got: vec![info_val as usize],
});
}
let mut d_info2 = alloc_zeros_i32(1, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner().device_ptr(&stream);
let (b_ptr, _b_sync) = d_b.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgetrs(
dn.cu(),
csys::cublasOperation_t::CUBLAS_OP_N,
n as i32,
nrhs as i32,
a_ptr as *const f64,
n as i32,
ipiv_ptr as *const i32,
b_ptr as *mut f64,
n as i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val2 = read_dev_info(&d_info2, device)?;
if info_val2 != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f64: triangular solve failed",
expected: vec![0],
got: vec![info_val2 as usize],
});
}
let x_col = download_f64(&d_b, device)?;
let mut x_host = vec![0.0f64; n * nrhs];
for i in 0..n {
for j in 0..nrhs {
x_host[i * nrhs + j] = x_col[j * n + i];
}
}
Ok(x_host)
}
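/// LU factorization (`getrf`) of a row-major `n x n` matrix that already lives
/// on the device.
///
/// Returns the combined `L`/`U` factors in row-major order together with the
/// pivot indices produced by `getrf` (1-based, following the LAPACK convention).
/// The input buffer is not modified; the factorization runs on a transposed copy.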
#[cfg(feature = "cuda")]
pub fn gpu_lu_factor_f32(
a_dev: &CudaBuffer<f32>,
n: usize,
device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f32>, CudaBuffer<i32>)> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return Ok((
crate::transfer::alloc_zeros_f32(0, device)?,
alloc_zeros_i32(0, device)?,
));
}
if a_dev.len() != n * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_lu_factor_f32: input length mismatch",
expected: vec![n * n],
got: vec![a_dev.len()],
});
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a_col = crate::kernels::gpu_transpose_2d(a_dev, n, n, device)?;
let mut d_ipiv = alloc_zeros_i32(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgetrf_bufferSize(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f32,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgetrf(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f32,
n as i32,
work_ptr as *mut f32,
ipiv_ptr as *mut i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_lu_factor_f32: getrf failed (singular matrix)",
expected: vec![0],
got: vec![info_val as usize],
});
}
let lu_row = crate::kernels::gpu_transpose_2d(&d_a_col, n, n, device)?;
Ok((lu_row, d_ipiv))
}
#[cfg(feature = "cuda")]
pub fn gpu_lu_factor_f64(
a_dev: &CudaBuffer<f64>,
n: usize,
device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f64>, CudaBuffer<i32>)> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return Ok((
crate::transfer::alloc_zeros_f64(0, device)?,
alloc_zeros_i32(0, device)?,
));
}
if a_dev.len() != n * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_lu_factor_f64: input length mismatch",
expected: vec![n * n],
got: vec![a_dev.len()],
});
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a_col = crate::kernels::gpu_transpose_2d_f64(a_dev, n, n, device)?;
let mut d_ipiv = alloc_zeros_i32(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgetrf_bufferSize(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f64,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgetrf(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f64,
n as i32,
work_ptr as *mut f64,
ipiv_ptr as *mut i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_lu_factor_f64: getrf failed (singular matrix)",
expected: vec![0],
got: vec![info_val as usize],
});
}
let lu_row = crate::kernels::gpu_transpose_2d_f64(&d_a_col, n, n, device)?;
Ok((lu_row, d_ipiv))
}
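/// Device-resident variant of [`gpu_cholesky_f32`]: factors a row-major
/// symmetric positive-definite `n x n` matrix that is already on the GPU and
/// returns the lower-triangular factor as a new row-major device buffer.
///
/// Because the input matrix is symmetric, it can be handed to `potrf` without an
/// explicit transpose; only the result is converted back to row-major, via a
/// host round-trip that also zeroes the strict upper triangle.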
#[cfg(feature = "cuda")]
pub fn gpu_cholesky_f32_dev(
a_dev: &CudaBuffer<f32>,
n: usize,
device: &GpuDevice,
) -> GpuResult<CudaBuffer<f32>> {
use cudarc::cusolver::sys as csys;
if a_dev.len() != n * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_cholesky_f32_dev: A length mismatch",
expected: vec![n * n],
got: vec![a_dev.len()],
});
}
if n == 0 {
return crate::transfer::alloc_zeros_f32(0, device);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a = crate::transfer::alloc_zeros_f32(n * n, device)?;
stream.memcpy_dtod(a_dev.inner(), d_a.inner_mut())?;
let mut d_info = alloc_zeros_i32(1, device)?;
let uplo = csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSpotrf_bufferSize(
dn.cu(),
uplo,
n as i32,
a_ptr as *mut f32,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSpotrf(
dn.cu(),
uplo,
n as i32,
a_ptr as *mut f32,
n as i32,
work_ptr as *mut f32,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_cholesky_f32_dev: potrf failed (matrix not positive definite)",
expected: vec![0],
got: vec![info_val as usize],
});
}
let host = crate::transfer::gpu_to_cpu(&d_a, device)?;
let mut row_major = vec![0.0_f32; n * n];
for j in 0..n {
for i in 0..n {
let cm = j * n + i;
if i >= j {
row_major[i * n + j] = host[cm];
}
}
}
crate::transfer::cpu_to_gpu(&row_major, device)
}
#[cfg(feature = "cuda")]
pub fn gpu_cholesky_f64_dev(
a_dev: &CudaBuffer<f64>,
n: usize,
device: &GpuDevice,
) -> GpuResult<CudaBuffer<f64>> {
use cudarc::cusolver::sys as csys;
if a_dev.len() != n * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_cholesky_f64_dev: A length mismatch",
expected: vec![n * n],
got: vec![a_dev.len()],
});
}
if n == 0 {
return crate::transfer::alloc_zeros_f64(0, device);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a = crate::transfer::alloc_zeros_f64(n * n, device)?;
stream.memcpy_dtod(a_dev.inner(), d_a.inner_mut())?;
let mut d_info = alloc_zeros_i32(1, device)?;
let uplo = csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDpotrf_bufferSize(
dn.cu(),
uplo,
n as i32,
a_ptr as *mut f64,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDpotrf(
dn.cu(),
uplo,
n as i32,
a_ptr as *mut f64,
n as i32,
work_ptr as *mut f64,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_cholesky_f64_dev: potrf failed (matrix not positive definite)",
expected: vec![0],
got: vec![info_val as usize],
});
}
let host = crate::transfer::gpu_to_cpu(&d_a, device)?;
let mut row_major = vec![0.0_f64; n * n];
for j in 0..n {
for i in 0..n {
let cm = j * n + i;
if i >= j {
row_major[i * n + j] = host[cm];
}
}
}
crate::transfer::cpu_to_gpu(&row_major, device)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_cholesky_f32_dev(
_a: &CudaBuffer<f32>,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<CudaBuffer<f32>> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_cholesky_f64_dev(
_a: &CudaBuffer<f64>,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<CudaBuffer<f64>> {
Err(GpuError::NoCudaFeature)
}
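/// Device-resident variant of [`gpu_solve_f32`]: solves `A * X = B` for buffers
/// already on the GPU and returns `X` as a new row-major `n x nrhs` device
/// buffer. Layout conversions are done with the on-device transpose kernel
/// instead of host round-trips.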
#[cfg(feature = "cuda")]
pub fn gpu_solve_f32_dev(
a_dev: &CudaBuffer<f32>,
b_dev: &CudaBuffer<f32>,
n: usize,
nrhs: usize,
device: &GpuDevice,
) -> GpuResult<CudaBuffer<f32>> {
use cudarc::cusolver::sys as csys;
if a_dev.len() != n * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f32_dev: A length mismatch",
expected: vec![n * n],
got: vec![a_dev.len()],
});
}
if b_dev.len() != n * nrhs {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f32_dev: B length mismatch",
expected: vec![n * nrhs],
got: vec![b_dev.len()],
});
}
if n == 0 {
return crate::transfer::alloc_zeros_f32(0, device);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a = crate::kernels::gpu_transpose_2d(a_dev, n, n, device)?;
let mut d_b = crate::kernels::gpu_transpose_2d(b_dev, n, nrhs, device)?;
let mut d_ipiv = alloc_zeros_i32(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgetrf_bufferSize(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f32,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgetrf(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f32,
n as i32,
work_ptr as *mut f32,
ipiv_ptr as *mut i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f32_dev: LU factorization failed (singular matrix)",
expected: vec![0],
got: vec![info_val as usize],
});
}
let mut d_info2 = alloc_zeros_i32(1, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner().device_ptr(&stream);
let (b_ptr, _b_sync) = d_b.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgetrs(
dn.cu(),
csys::cublasOperation_t::CUBLAS_OP_N,
n as i32,
nrhs as i32,
a_ptr as *const f32,
n as i32,
ipiv_ptr as *const i32,
b_ptr as *mut f32,
n as i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info2 = read_dev_info(&d_info2, device)?;
if info2 != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f32_dev: triangular solve failed",
expected: vec![0],
got: vec![info2 as usize],
});
}
crate::kernels::gpu_transpose_2d(&d_b, nrhs, n, device)
}
#[cfg(feature = "cuda")]
pub fn gpu_solve_f64_dev(
a_dev: &CudaBuffer<f64>,
b_dev: &CudaBuffer<f64>,
n: usize,
nrhs: usize,
device: &GpuDevice,
) -> GpuResult<CudaBuffer<f64>> {
use cudarc::cusolver::sys as csys;
if a_dev.len() != n * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f64_dev: A length mismatch",
expected: vec![n * n],
got: vec![a_dev.len()],
});
}
if b_dev.len() != n * nrhs {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f64_dev: B length mismatch",
expected: vec![n * nrhs],
got: vec![b_dev.len()],
});
}
if n == 0 {
return crate::transfer::alloc_zeros_f64(0, device);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a = crate::kernels::gpu_transpose_2d_f64(a_dev, n, n, device)?;
let mut d_b = crate::kernels::gpu_transpose_2d_f64(b_dev, n, nrhs, device)?;
let mut d_ipiv = alloc_zeros_i32(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgetrf_bufferSize(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f64,
n as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgetrf(
dn.cu(),
n as i32,
n as i32,
a_ptr as *mut f64,
n as i32,
work_ptr as *mut f64,
ipiv_ptr as *mut i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f64_dev: LU factorization failed",
expected: vec![0],
got: vec![info_val as usize],
});
}
let mut d_info2 = alloc_zeros_i32(1, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner().device_ptr(&stream);
let (b_ptr, _b_sync) = d_b.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgetrs(
dn.cu(),
csys::cublasOperation_t::CUBLAS_OP_N,
n as i32,
nrhs as i32,
a_ptr as *const f64,
n as i32,
ipiv_ptr as *const i32,
b_ptr as *mut f64,
n as i32,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info2 = read_dev_info(&d_info2, device)?;
if info2 != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_solve_f64_dev: triangular solve failed",
expected: vec![0],
got: vec![info2 as usize],
});
}
crate::kernels::gpu_transpose_2d_f64(&d_b, nrhs, n, device)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_solve_f32_dev(
_a: &CudaBuffer<f32>,
_b: &CudaBuffer<f32>,
_n: usize,
_nrhs: usize,
_device: &GpuDevice,
) -> GpuResult<CudaBuffer<f32>> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_solve_f64_dev(
_a: &CudaBuffer<f64>,
_b: &CudaBuffer<f64>,
_n: usize,
_nrhs: usize,
_device: &GpuDevice,
) -> GpuResult<CudaBuffer<f64>> {
Err(GpuError::NoCudaFeature)
}
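/// Least-squares solve `min ||A * X - B||` for a row-major `m x n` matrix `A`
/// and an `m x nrhs` right-hand side `B`, using the iterative-refinement `gels`
/// driver (`cusolverDnSSgels`, f32 working and output precision). Returns `X`
/// as a row-major `n x nrhs` device buffer. The workspace size is reported in
/// bytes and rounded up to whole `f32` elements before allocation; only a
/// negative `info` (invalid argument) is treated as an error.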
#[cfg(feature = "cuda")]
pub fn gpu_lstsq_f32(
a_dev: &CudaBuffer<f32>,
b_dev: &CudaBuffer<f32>,
m: usize,
n: usize,
nrhs: usize,
device: &GpuDevice,
) -> GpuResult<CudaBuffer<f32>> {
use cudarc::cusolver::sys as csys;
if a_dev.len() != m * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_lstsq_f32: A length mismatch",
expected: vec![m * n],
got: vec![a_dev.len()],
});
}
if b_dev.len() != m * nrhs {
return Err(GpuError::ShapeMismatch {
op: "gpu_lstsq_f32: B length mismatch",
expected: vec![m * nrhs],
got: vec![b_dev.len()],
});
}
if m == 0 || n == 0 || nrhs == 0 {
return crate::transfer::alloc_zeros_f32(n * nrhs, device);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a_col = crate::kernels::gpu_transpose_2d(a_dev, m, n, device)?;
let mut d_b_col = crate::kernels::gpu_transpose_2d(b_dev, m, nrhs, device)?;
let mut d_x = crate::transfer::alloc_zeros_f32(n * nrhs, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut iter: i32 = 0;
let mut lwork_bytes: usize = 0;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (b_ptr, _b_sync) = d_b_col.inner_mut().device_ptr_mut(&stream);
let (x_ptr, _x_sync) = d_x.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSSgels_bufferSize(
dn.cu(),
m as i32,
n as i32,
nrhs as i32,
a_ptr as *mut f32,
m as i32,
b_ptr as *mut f32,
m as i32,
x_ptr as *mut f32,
n as i32,
std::ptr::null_mut(),
&mut lwork_bytes,
)
.result()?;
}
let work_elems = lwork_bytes.div_ceil(4).max(1);
let mut d_work = crate::transfer::alloc_zeros_f32(work_elems, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (b_ptr, _b_sync) = d_b_col.inner_mut().device_ptr_mut(&stream);
let (x_ptr, _x_sync) = d_x.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSSgels(
dn.cu(),
m as i32,
n as i32,
nrhs as i32,
a_ptr as *mut f32,
m as i32,
b_ptr as *mut f32,
m as i32,
x_ptr as *mut f32,
n as i32,
work_ptr as *mut std::ffi::c_void,
lwork_bytes,
&mut iter,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val < 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_lstsq_f32: SSgels reported an invalid argument",
expected: vec![0],
got: vec![info_val.unsigned_abs() as usize],
});
}
crate::kernels::gpu_transpose_2d(&d_x, n, nrhs, device)
}
#[cfg(feature = "cuda")]
pub fn gpu_lstsq_f64(
a_dev: &CudaBuffer<f64>,
b_dev: &CudaBuffer<f64>,
m: usize,
n: usize,
nrhs: usize,
device: &GpuDevice,
) -> GpuResult<CudaBuffer<f64>> {
use cudarc::cusolver::sys as csys;
if a_dev.len() != m * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_lstsq_f64: A length mismatch",
expected: vec![m * n],
got: vec![a_dev.len()],
});
}
if b_dev.len() != m * nrhs {
return Err(GpuError::ShapeMismatch {
op: "gpu_lstsq_f64: B length mismatch",
expected: vec![m * nrhs],
got: vec![b_dev.len()],
});
}
if m == 0 || n == 0 || nrhs == 0 {
return crate::transfer::alloc_zeros_f64(n * nrhs, device);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a_col = crate::kernels::gpu_transpose_2d_f64(a_dev, m, n, device)?;
let mut d_b_col = crate::kernels::gpu_transpose_2d_f64(b_dev, m, nrhs, device)?;
let mut d_x = crate::transfer::alloc_zeros_f64(n * nrhs, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut iter: i32 = 0;
let mut lwork_bytes: usize = 0;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (b_ptr, _b_sync) = d_b_col.inner_mut().device_ptr_mut(&stream);
let (x_ptr, _x_sync) = d_x.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDDgels_bufferSize(
dn.cu(),
m as i32,
n as i32,
nrhs as i32,
a_ptr as *mut f64,
m as i32,
b_ptr as *mut f64,
m as i32,
x_ptr as *mut f64,
n as i32,
std::ptr::null_mut(),
&mut lwork_bytes,
)
.result()?;
}
let work_elems = lwork_bytes.div_ceil(8).max(1);
let mut d_work = crate::transfer::alloc_zeros_f64(work_elems, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (b_ptr, _b_sync) = d_b_col.inner_mut().device_ptr_mut(&stream);
let (x_ptr, _x_sync) = d_x.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDDgels(
dn.cu(),
m as i32,
n as i32,
nrhs as i32,
a_ptr as *mut f64,
m as i32,
b_ptr as *mut f64,
m as i32,
x_ptr as *mut f64,
n as i32,
work_ptr as *mut std::ffi::c_void,
lwork_bytes,
&mut iter,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val < 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_lstsq_f64: DDgels reported an invalid argument",
expected: vec![0],
got: vec![info_val.unsigned_abs() as usize],
});
}
crate::kernels::gpu_transpose_2d_f64(&d_x, n, nrhs, device)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_lstsq_f32(
_a: &CudaBuffer<f32>,
_b: &CudaBuffer<f32>,
_m: usize,
_n: usize,
_nrhs: usize,
_device: &GpuDevice,
) -> GpuResult<CudaBuffer<f32>> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_lstsq_f64(
_a: &CudaBuffer<f64>,
_b: &CudaBuffer<f64>,
_m: usize,
_n: usize,
_nrhs: usize,
_device: &GpuDevice,
) -> GpuResult<CudaBuffer<f64>> {
Err(GpuError::NoCudaFeature)
}
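/// Minimal RAII wrapper around `cusolverDnParams_t` for the 64-bit "X" API
/// (`cusolverDnXgeev`); the params object is created on construction and
/// destroyed on drop.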
#[cfg(feature = "cuda")]
struct DnParamsHandle {
inner: cudarc::cusolver::sys::cusolverDnParams_t,
}
#[cfg(feature = "cuda")]
impl DnParamsHandle {
fn new() -> GpuResult<Self> {
use cudarc::cusolver::sys as csys;
let mut p: csys::cusolverDnParams_t = std::ptr::null_mut();
unsafe {
csys::cusolverDnCreateParams(&mut p).result()?;
}
Ok(Self { inner: p })
}
fn raw(&self) -> cudarc::cusolver::sys::cusolverDnParams_t {
self.inner
}
}
#[cfg(feature = "cuda")]
impl Drop for DnParamsHandle {
fn drop(&mut self) {
if !self.inner.is_null() {
unsafe {
let _ = cudarc::cusolver::sys::cusolverDnDestroyParams(self.inner);
}
}
}
}
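/// General (non-symmetric) eigendecomposition of a row-major `n x n` matrix via
/// the 64-bit `cusolverDnXgeev` API, computing eigenvalues and right
/// eigenvectors only (left eigenvectors are requested as `NOVECTOR`).
///
/// Returns `(w, vr)` where both buffers hold interleaved complex values as
/// `f32` pairs: `w` has `2 * n` elements (re, im per eigenvalue) and `vr` has
/// `2 * n * n` elements arranged as a row-major `n x n` complex matrix, with
/// eigenvector `j` stored down column `j`.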
#[cfg(feature = "cuda")]
pub fn gpu_eig_f32(
a_dev: &CudaBuffer<f32>,
n: usize,
device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f32>, CudaBuffer<f32>)> {
use cudarc::cusolver::sys as csys;
if a_dev.len() != n * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_eig_f32: A length mismatch",
expected: vec![n * n],
got: vec![a_dev.len()],
});
}
if n == 0 {
return Ok((
crate::transfer::alloc_zeros_f32(0, device)?,
crate::transfer::alloc_zeros_f32(0, device)?,
));
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let params = DnParamsHandle::new()?;
let mut d_a_col = crate::kernels::gpu_transpose_2d(a_dev, n, n, device)?;
let mut d_w = crate::transfer::alloc_zeros_f32(2 * n, device)?;
let mut d_vr = crate::transfer::alloc_zeros_f32(2 * n * n, device)?;
let mut d_vl_dummy = crate::transfer::alloc_zeros_f32(2, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let novec = csys::cusolverEigMode_t::CUSOLVER_EIG_MODE_NOVECTOR;
let vec_mode = csys::cusolverEigMode_t::CUSOLVER_EIG_MODE_VECTOR;
let dt_a = csys::cudaDataType::CUDA_R_32F;
let dt_w = csys::cudaDataType::CUDA_C_32F;
let dt_v = csys::cudaDataType::CUDA_C_32F;
let compute_type = csys::cudaDataType::CUDA_R_32F;
let mut wks_dev_bytes: usize = 0;
let mut wks_host_bytes: usize = 0;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (w_ptr, _w_sync) = d_w.inner_mut().device_ptr_mut(&stream);
let (vr_ptr, _vr_sync) = d_vr.inner_mut().device_ptr_mut(&stream);
let (vl_ptr, _vl_sync) = d_vl_dummy.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnXgeev_bufferSize(
dn.cu(),
params.raw(),
novec,
vec_mode,
n as i64,
dt_a,
a_ptr as *const std::ffi::c_void,
n as i64,
dt_w,
w_ptr as *const std::ffi::c_void,
dt_v,
vl_ptr as *const std::ffi::c_void,
n as i64,
dt_v,
vr_ptr as *const std::ffi::c_void,
n as i64,
compute_type,
&mut wks_dev_bytes,
&mut wks_host_bytes,
)
.result()?;
}
let dev_work_elems = wks_dev_bytes.div_ceil(4).max(1);
let mut d_work = crate::transfer::alloc_zeros_f32(dev_work_elems, device)?;
let mut host_work: Vec<u8> = vec![0u8; wks_host_bytes];
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (w_ptr, _w_sync) = d_w.inner_mut().device_ptr_mut(&stream);
let (vr_ptr, _vr_sync) = d_vr.inner_mut().device_ptr_mut(&stream);
let (vl_ptr, _vl_sync) = d_vl_dummy.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnXgeev(
dn.cu(),
params.raw(),
novec,
vec_mode,
n as i64,
dt_a,
a_ptr as *mut std::ffi::c_void,
n as i64,
dt_w,
w_ptr as *mut std::ffi::c_void,
dt_v,
vl_ptr as *mut std::ffi::c_void,
n as i64,
dt_v,
vr_ptr as *mut std::ffi::c_void,
n as i64,
compute_type,
work_ptr as *mut std::ffi::c_void,
wks_dev_bytes,
host_work.as_mut_ptr() as *mut std::ffi::c_void,
wks_host_bytes,
info_ptr as *mut std::ffi::c_int,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_eig_f32: Xgeev failed (non-converged eigenvalues)",
expected: vec![0],
got: vec![info_val.unsigned_abs() as usize],
});
}
let host = crate::transfer::gpu_to_cpu(&d_vr, device)?;
let mut row_major = vec![0.0_f32; 2 * n * n];
for j in 0..n {
for i in 0..n {
let src = (j * n + i) * 2;
let dst = (i * n + j) * 2;
row_major[dst] = host[src];
row_major[dst + 1] = host[src + 1];
}
}
let d_vr_row = crate::transfer::cpu_to_gpu(&row_major, device)?;
Ok((d_w, d_vr_row))
}
#[cfg(feature = "cuda")]
pub fn gpu_eig_f64(
a_dev: &CudaBuffer<f64>,
n: usize,
device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f64>, CudaBuffer<f64>)> {
use cudarc::cusolver::sys as csys;
if a_dev.len() != n * n {
return Err(GpuError::ShapeMismatch {
op: "gpu_eig_f64: A length mismatch",
expected: vec![n * n],
got: vec![a_dev.len()],
});
}
if n == 0 {
return Ok((
crate::transfer::alloc_zeros_f64(0, device)?,
crate::transfer::alloc_zeros_f64(0, device)?,
));
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let params = DnParamsHandle::new()?;
let mut d_a_col = crate::kernels::gpu_transpose_2d_f64(a_dev, n, n, device)?;
let mut d_w = crate::transfer::alloc_zeros_f64(2 * n, device)?;
let mut d_vr = crate::transfer::alloc_zeros_f64(2 * n * n, device)?;
let mut d_vl_dummy = crate::transfer::alloc_zeros_f64(2, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let novec = csys::cusolverEigMode_t::CUSOLVER_EIG_MODE_NOVECTOR;
let vec_mode = csys::cusolverEigMode_t::CUSOLVER_EIG_MODE_VECTOR;
let dt_a = csys::cudaDataType::CUDA_R_64F;
let dt_w = csys::cudaDataType::CUDA_C_64F;
let dt_v = csys::cudaDataType::CUDA_C_64F;
let compute_type = csys::cudaDataType::CUDA_R_64F;
let mut wks_dev_bytes: usize = 0;
let mut wks_host_bytes: usize = 0;
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (w_ptr, _w_sync) = d_w.inner_mut().device_ptr_mut(&stream);
let (vr_ptr, _vr_sync) = d_vr.inner_mut().device_ptr_mut(&stream);
let (vl_ptr, _vl_sync) = d_vl_dummy.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnXgeev_bufferSize(
dn.cu(),
params.raw(),
novec,
vec_mode,
n as i64,
dt_a,
a_ptr as *const std::ffi::c_void,
n as i64,
dt_w,
w_ptr as *const std::ffi::c_void,
dt_v,
vl_ptr as *const std::ffi::c_void,
n as i64,
dt_v,
vr_ptr as *const std::ffi::c_void,
n as i64,
compute_type,
&mut wks_dev_bytes,
&mut wks_host_bytes,
)
.result()?;
}
let dev_work_elems = wks_dev_bytes.div_ceil(8).max(1);
let mut d_work = crate::transfer::alloc_zeros_f64(dev_work_elems, device)?;
let mut host_work: Vec<u8> = vec![0u8; wks_host_bytes];
unsafe {
let (a_ptr, _a_sync) = d_a_col.inner_mut().device_ptr_mut(&stream);
let (w_ptr, _w_sync) = d_w.inner_mut().device_ptr_mut(&stream);
let (vr_ptr, _vr_sync) = d_vr.inner_mut().device_ptr_mut(&stream);
let (vl_ptr, _vl_sync) = d_vl_dummy.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnXgeev(
dn.cu(),
params.raw(),
novec,
vec_mode,
n as i64,
dt_a,
a_ptr as *mut std::ffi::c_void,
n as i64,
dt_w,
w_ptr as *mut std::ffi::c_void,
dt_v,
vl_ptr as *mut std::ffi::c_void,
n as i64,
dt_v,
vr_ptr as *mut std::ffi::c_void,
n as i64,
compute_type,
work_ptr as *mut std::ffi::c_void,
wks_dev_bytes,
host_work.as_mut_ptr() as *mut std::ffi::c_void,
wks_host_bytes,
info_ptr as *mut std::ffi::c_int,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_eig_f64: Xgeev failed",
expected: vec![0],
got: vec![info_val.unsigned_abs() as usize],
});
}
let host = crate::transfer::gpu_to_cpu(&d_vr, device)?;
let mut row_major = vec![0.0_f64; 2 * n * n];
for j in 0..n {
for i in 0..n {
let src = (j * n + i) * 2;
let dst = (i * n + j) * 2;
row_major[dst] = host[src];
row_major[dst + 1] = host[src + 1];
}
}
let d_vr_row = crate::transfer::cpu_to_gpu(&row_major, device)?;
Ok((d_w, d_vr_row))
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_eig_f32(
_a: &CudaBuffer<f32>,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f32>, CudaBuffer<f32>)> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_eig_f64(
_a: &CudaBuffer<f64>,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f64>, CudaBuffer<f64>)> {
Err(GpuError::NoCudaFeature)
}
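/// Reduced ("thin") QR factorization of a row-major `m x n` matrix via cuSOLVER
/// `geqrf` followed by `orgqr`.
///
/// Returns `(Q, R)` with `Q` row-major `m x k` (orthonormal columns) and `R`
/// row-major `k x n` upper triangular, where `k = min(m, n)`. `R` is extracted
/// from the packed `geqrf` output before `orgqr` overwrites it with the
/// explicit `Q`.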
#[cfg(feature = "cuda")]
pub fn gpu_qr_f32(
data: &[f32],
m: usize,
n: usize,
device: &GpuDevice,
) -> GpuResult<(Vec<f32>, Vec<f32>)> {
use cudarc::cusolver::sys as csys;
if m == 0 || n == 0 {
return Ok((vec![], vec![]));
}
let k = m.min(n);
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let col_major = transpose_f32(data, m, n);
let mut d_a = upload_f32(&col_major, device)?;
let mut d_tau = crate::transfer::alloc_zeros_f32(k, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgeqrf_bufferSize(
dn.cu(),
m as i32,
n as i32,
a_ptr as *mut f32,
m as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (tau_ptr, _tau_sync) = d_tau.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSgeqrf(
dn.cu(),
m as i32,
n as i32,
a_ptr as *mut f32,
m as i32,
tau_ptr as *mut f32,
work_ptr as *mut f32,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_qr_f32: geqrf failed",
expected: vec![0],
got: vec![info_val as usize],
});
}
let qr_col = download_f32(&d_a, device)?;
let mut r_host = vec![0.0f32; k * n];
for i in 0..k {
for j in 0..n {
if j >= i {
r_host[i * n + j] = qr_col[j * m + i];
}
}
}
let mut lwork_orgqr: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
let (tau_ptr, _tau_sync) = d_tau.inner().device_ptr(&stream);
csys::cusolverDnSorgqr_bufferSize(
dn.cu(),
m as i32,
k as i32,
k as i32,
a_ptr as *const f32,
m as i32,
tau_ptr as *const f32,
&mut lwork_orgqr,
)
.result()?;
}
let mut d_work2 = crate::transfer::alloc_zeros_f32(lwork_orgqr.max(1) as usize, device)?;
let mut d_info2 = alloc_zeros_i32(1, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (tau_ptr, _tau_sync) = d_tau.inner().device_ptr(&stream);
let (work_ptr, _work_sync) = d_work2.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSorgqr(
dn.cu(),
m as i32,
k as i32,
k as i32,
a_ptr as *mut f32,
m as i32,
tau_ptr as *const f32,
work_ptr as *mut f32,
lwork_orgqr,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val2 = read_dev_info(&d_info2, device)?;
if info_val2 != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_qr_f32: orgqr failed",
expected: vec![0],
got: vec![info_val2 as usize],
});
}
let q_full_col = download_f32(&d_a, device)?;
let mut q_host = vec![0.0f32; m * k];
for i in 0..m {
for j in 0..k {
q_host[i * k + j] = q_full_col[j * m + i];
}
}
Ok((q_host, r_host))
}
#[cfg(feature = "cuda")]
pub fn gpu_qr_f64(
data: &[f64],
m: usize,
n: usize,
device: &GpuDevice,
) -> GpuResult<(Vec<f64>, Vec<f64>)> {
use cudarc::cusolver::sys as csys;
if m == 0 || n == 0 {
return Ok((vec![], vec![]));
}
let k = m.min(n);
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let col_major = transpose_f64(data, m, n);
let mut d_a = upload_f64(&col_major, device)?;
let mut d_tau = crate::transfer::alloc_zeros_f64(k, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let mut lwork: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgeqrf_bufferSize(
dn.cu(),
m as i32,
n as i32,
a_ptr as *mut f64,
m as i32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (tau_ptr, _tau_sync) = d_tau.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDgeqrf(
dn.cu(),
m as i32,
n as i32,
a_ptr as *mut f64,
m as i32,
tau_ptr as *mut f64,
work_ptr as *mut f64,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_qr_f64: geqrf failed",
expected: vec![0],
got: vec![info_val as usize],
});
}
let qr_col = download_f64(&d_a, device)?;
let mut r_host = vec![0.0f64; k * n];
for i in 0..k {
for j in 0..n {
if j >= i {
r_host[i * n + j] = qr_col[j * m + i];
}
}
}
let mut lwork_orgqr: i32 = 0;
unsafe {
let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
let (tau_ptr, _tau_sync) = d_tau.inner().device_ptr(&stream);
csys::cusolverDnDorgqr_bufferSize(
dn.cu(),
m as i32,
k as i32,
k as i32,
a_ptr as *const f64,
m as i32,
tau_ptr as *const f64,
&mut lwork_orgqr,
)
.result()?;
}
let mut d_work2 = crate::transfer::alloc_zeros_f64(lwork_orgqr.max(1) as usize, device)?;
let mut d_info2 = alloc_zeros_i32(1, device)?;
unsafe {
let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
let (tau_ptr, _tau_sync) = d_tau.inner().device_ptr(&stream);
let (work_ptr, _work_sync) = d_work2.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDorgqr(
dn.cu(),
m as i32,
k as i32,
k as i32,
a_ptr as *mut f64,
m as i32,
tau_ptr as *const f64,
work_ptr as *mut f64,
lwork_orgqr,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val2 = read_dev_info(&d_info2, device)?;
if info_val2 != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_qr_f64: orgqr failed",
expected: vec![0],
got: vec![info_val2 as usize],
});
}
let q_full_col = download_f64(&d_a, device)?;
let mut q_host = vec![0.0f64; m * k];
for i in 0..m {
for j in 0..k {
q_host[i * k + j] = q_full_col[j * m + i];
}
}
Ok((q_host, r_host))
}
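/// Symmetric eigendecomposition (`syevd`) of a row-major `n x n` matrix already
/// on the device. Returns `(w, v)` where `w` holds the eigenvalues in ascending
/// order and `v` is the row-major `n x n` matrix whose columns are the
/// corresponding eigenvectors. Because only one triangle of a symmetric matrix
/// is read, the row-major input can be handed to cuSOLVER without a transpose.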
#[cfg(feature = "cuda")]
pub fn gpu_eigh_f32(
input: &CudaBuffer<f32>,
n: usize,
device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f32>, CudaBuffer<f32>)> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return Ok((
crate::transfer::alloc_zeros_f32(0, device)?,
crate::transfer::alloc_zeros_f32(0, device)?,
));
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a = crate::transfer::alloc_zeros_f32(n * n, device)?;
stream.memcpy_dtod(input.inner(), d_a.inner_mut())?;
let mut d_w = crate::transfer::alloc_zeros_f32(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let jobz = csys::cusolverEigMode_t::CUSOLVER_EIG_MODE_VECTOR;
let uplo = csys::cublasFillMode_t::CUBLAS_FILL_MODE_UPPER;
let mut lwork: i32 = 0;
unsafe {
csys::cusolverDnSsyevd_bufferSize(
dn.cu(),
jobz,
uplo,
n as i32,
d_a.inner().device_ptr(&stream).0 as *const f32,
n as i32,
d_w.inner().device_ptr(&stream).0 as *const f32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _) = d_a.inner_mut().device_ptr_mut(&stream);
let (w_ptr, _) = d_w.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSsyevd(
dn.cu(),
jobz,
uplo,
n as i32,
a_ptr as *mut f32,
n as i32,
w_ptr as *mut f32,
work_ptr as *mut f32,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_eigh_f32: syevd failed",
expected: vec![0],
got: vec![info_val.unsigned_abs() as usize],
});
}
let v_rm = crate::kernels::gpu_transpose_2d(&d_a, n, n, device)?;
Ok((d_w, v_rm))
}
#[cfg(feature = "cuda")]
pub fn gpu_eigh_f64(
input: &CudaBuffer<f64>,
n: usize,
device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f64>, CudaBuffer<f64>)> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return Ok((
crate::transfer::alloc_zeros_f64(0, device)?,
crate::transfer::alloc_zeros_f64(0, device)?,
));
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a = crate::transfer::alloc_zeros_f64(n * n, device)?;
stream.memcpy_dtod(input.inner(), d_a.inner_mut())?;
let mut d_w = crate::transfer::alloc_zeros_f64(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let jobz = csys::cusolverEigMode_t::CUSOLVER_EIG_MODE_VECTOR;
let uplo = csys::cublasFillMode_t::CUBLAS_FILL_MODE_UPPER;
let mut lwork: i32 = 0;
unsafe {
csys::cusolverDnDsyevd_bufferSize(
dn.cu(),
jobz,
uplo,
n as i32,
d_a.inner().device_ptr(&stream).0 as *const f64,
n as i32,
d_w.inner().device_ptr(&stream).0 as *const f64,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _) = d_a.inner_mut().device_ptr_mut(&stream);
let (w_ptr, _) = d_w.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDsyevd(
dn.cu(),
jobz,
uplo,
n as i32,
a_ptr as *mut f64,
n as i32,
w_ptr as *mut f64,
work_ptr as *mut f64,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_eigh_f64: syevd failed",
expected: vec![0],
got: vec![info_val.unsigned_abs() as usize],
});
}
let v_rm = crate::kernels::gpu_transpose_2d_f64(&d_a, n, n, device)?;
Ok((d_w, v_rm))
}
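/// Eigenvalues only (`syevd` with `CUSOLVER_EIG_MODE_NOVECTOR`) of a symmetric
/// row-major `n x n` matrix already on the device, returned in ascending order.
/// This is the vector-free counterpart of [`gpu_eigh_f32`].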
#[cfg(feature = "cuda")]
pub fn gpu_eigvalsh_f32(
input: &CudaBuffer<f32>,
n: usize,
device: &GpuDevice,
) -> GpuResult<CudaBuffer<f32>> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return crate::transfer::alloc_zeros_f32(0, device);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a = crate::transfer::alloc_zeros_f32(n * n, device)?;
stream.memcpy_dtod(input.inner(), d_a.inner_mut())?;
let mut d_w = crate::transfer::alloc_zeros_f32(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let jobz = csys::cusolverEigMode_t::CUSOLVER_EIG_MODE_NOVECTOR;
let uplo = csys::cublasFillMode_t::CUBLAS_FILL_MODE_UPPER;
let mut lwork: i32 = 0;
unsafe {
csys::cusolverDnSsyevd_bufferSize(
dn.cu(),
jobz,
uplo,
n as i32,
d_a.inner().device_ptr(&stream).0 as *const f32,
n as i32,
d_w.inner().device_ptr(&stream).0 as *const f32,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _) = d_a.inner_mut().device_ptr_mut(&stream);
let (w_ptr, _) = d_w.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnSsyevd(
dn.cu(),
jobz,
uplo,
n as i32,
a_ptr as *mut f32,
n as i32,
w_ptr as *mut f32,
work_ptr as *mut f32,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_eigvalsh_f32: syevd failed",
expected: vec![0],
got: vec![info_val.unsigned_abs() as usize],
});
}
Ok(d_w)
}
#[cfg(feature = "cuda")]
pub fn gpu_eigvalsh_f64(
input: &CudaBuffer<f64>,
n: usize,
device: &GpuDevice,
) -> GpuResult<CudaBuffer<f64>> {
use cudarc::cusolver::sys as csys;
if n == 0 {
return crate::transfer::alloc_zeros_f64(0, device);
}
let stream = device.stream();
let dn = cusolver_safe::DnHandle::new(stream.clone())?;
let mut d_a = crate::transfer::alloc_zeros_f64(n * n, device)?;
stream.memcpy_dtod(input.inner(), d_a.inner_mut())?;
let mut d_w = crate::transfer::alloc_zeros_f64(n, device)?;
let mut d_info = alloc_zeros_i32(1, device)?;
let jobz = csys::cusolverEigMode_t::CUSOLVER_EIG_MODE_NOVECTOR;
let uplo = csys::cublasFillMode_t::CUBLAS_FILL_MODE_UPPER;
let mut lwork: i32 = 0;
unsafe {
csys::cusolverDnDsyevd_bufferSize(
dn.cu(),
jobz,
uplo,
n as i32,
d_a.inner().device_ptr(&stream).0 as *const f64,
n as i32,
d_w.inner().device_ptr(&stream).0 as *const f64,
&mut lwork,
)
.result()?;
}
let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
unsafe {
let (a_ptr, _) = d_a.inner_mut().device_ptr_mut(&stream);
let (w_ptr, _) = d_w.inner_mut().device_ptr_mut(&stream);
let (work_ptr, _) = d_work.inner_mut().device_ptr_mut(&stream);
let (info_ptr, _) = d_info.inner_mut().device_ptr_mut(&stream);
csys::cusolverDnDsyevd(
dn.cu(),
jobz,
uplo,
n as i32,
a_ptr as *mut f64,
n as i32,
w_ptr as *mut f64,
work_ptr as *mut f64,
lwork,
info_ptr as *mut i32,
)
.result()?;
}
stream.synchronize()?;
let info_val = read_dev_info(&d_info, device)?;
if info_val != 0 {
return Err(GpuError::ShapeMismatch {
op: "gpu_eigvalsh_f64: syevd failed",
expected: vec![0],
got: vec![info_val.unsigned_abs() as usize],
});
}
Ok(d_w)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_svd_f32(
_data: &[f32],
_m: usize,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<(Vec<f32>, Vec<f32>, Vec<f32>)> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_svd_f64(
_data: &[f64],
_m: usize,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<(Vec<f64>, Vec<f64>, Vec<f64>)> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_cholesky_f32(_data: &[f32], _n: usize, _device: &GpuDevice) -> GpuResult<Vec<f32>> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_cholesky_f64(_data: &[f64], _n: usize, _device: &GpuDevice) -> GpuResult<Vec<f64>> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_solve_f32(
_a_data: &[f32],
_b_data: &[f32],
_n: usize,
_nrhs: usize,
_device: &GpuDevice,
) -> GpuResult<Vec<f32>> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_solve_f64(
_a_data: &[f64],
_b_data: &[f64],
_n: usize,
_nrhs: usize,
_device: &GpuDevice,
) -> GpuResult<Vec<f64>> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_lu_factor_f32(
_a_dev: &CudaBuffer<f32>,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f32>, CudaBuffer<i32>)> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_lu_factor_f64(
_a_dev: &CudaBuffer<f64>,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f64>, CudaBuffer<i32>)> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_qr_f32(
_data: &[f32],
_m: usize,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<(Vec<f32>, Vec<f32>)> {
Err(GpuError::NoCudaFeature)
}
#[cfg(not(feature = "cuda"))]
pub fn gpu_qr_f64(
_data: &[f64],
_m: usize,
_n: usize,
_device: &GpuDevice,
) -> GpuResult<(Vec<f64>, Vec<f64>)> {
Err(GpuError::NoCudaFeature)
}
#[cfg(test)]
#[cfg(feature = "cuda")]
mod tests {
use super::*;
fn device() -> GpuDevice {
GpuDevice::new(0).expect("CUDA device 0")
}
#[test]
fn svd_reconstructs_3x2() {
let dev = device();
let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let (m, n) = (3, 2);
let (u, s, vt) = gpu_svd_f32(&a, m, n, &dev).unwrap();
let k = m.min(n);
assert_eq!(u.len(), m * k);
assert_eq!(s.len(), k);
assert_eq!(vt.len(), k * n);
let mut recon = vec![0.0f32; m * n];
for i in 0..m {
for j in 0..n {
let mut acc = 0.0f32;
for p in 0..k {
acc += u[i * k + p] * s[p] * vt[p * n + j];
}
recon[i * n + j] = acc;
}
}
for i in 0..m * n {
assert!(
(recon[i] - a[i]).abs() < 1e-3,
"SVD reconstruction failed at {i}: {} vs {}",
recon[i],
a[i]
);
}
}
#[test]
fn svd_singular_values_descending() {
let dev = device();
let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let (_, s, _) = gpu_svd_f32(&a, 3, 2, &dev).unwrap();
assert!(s[0] >= s[1], "singular values must be descending");
}
#[test]
fn svd_square_identity() {
let dev = device();
let eye: Vec<f32> = vec![1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0];
let (_, s, _) = gpu_svd_f32(&eye, 3, 3, &dev).unwrap();
for &sv in &s {
assert!((sv - 1.0).abs() < 1e-4, "identity SVD should have all ones");
}
}
#[test]
fn svd_empty() {
let dev = device();
let (u, s, vt) = gpu_svd_f32(&[], 0, 0, &dev).unwrap();
assert!(u.is_empty());
assert!(s.is_empty());
assert!(vt.is_empty());
}
#[test]
fn cholesky_spd_3x3() {
let dev = device();
#[rustfmt::skip]
let a: Vec<f32> = vec![
6.0, 5.0, 1.0,
5.0, 12.0, 5.0,
1.0, 5.0, 6.0,
];
let l = gpu_cholesky_f32(&a, 3, &dev).unwrap();
for i in 0..3 {
for j in (i + 1)..3 {
assert!(
l[i * 3 + j].abs() < 1e-5,
"L[{i},{j}] = {} should be 0",
l[i * 3 + j]
);
}
}
let mut llt = [0.0f32; 9];
for i in 0..3 {
for j in 0..3 {
let mut acc = 0.0f32;
for p in 0..3 {
acc += l[i * 3 + p] * l[j * 3 + p];
}
llt[i * 3 + j] = acc;
}
}
for i in 0..9 {
assert!(
(llt[i] - a[i]).abs() < 1e-3,
"L*L^T[{i}] = {} vs A[{i}] = {}",
llt[i],
a[i]
);
}
}
#[test]
fn cholesky_identity() {
let dev = device();
let eye: Vec<f32> = vec![1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0];
let l = gpu_cholesky_f32(&eye, 3, &dev).unwrap();
for i in 0..3 {
for j in 0..3 {
let expected = if i == j { 1.0 } else { 0.0 };
assert!(
(l[i * 3 + j] - expected).abs() < 1e-5,
"L[{i},{j}] = {} (expected {})",
l[i * 3 + j],
expected
);
}
}
}
#[test]
fn cholesky_empty() {
let dev = device();
let l = gpu_cholesky_f32(&[], 0, &dev).unwrap();
assert!(l.is_empty());
}
#[test]
fn solve_2x2_simple() {
let dev = device();
let a: Vec<f32> = vec![2.0, 1.0, 1.0, 3.0];
let b: Vec<f32> = vec![5.0, 10.0];
let x = gpu_solve_f32(&a, &b, 2, 1, &dev).unwrap();
assert!((x[0] - 1.0).abs() < 1e-3, "x[0] = {} (expected 1.0)", x[0]);
assert!((x[1] - 3.0).abs() < 1e-3, "x[1] = {} (expected 3.0)", x[1]);
}
#[test]
fn solve_identity() {
let dev = device();
let eye: Vec<f32> = vec![1.0, 0.0, 0.0, 1.0];
let b: Vec<f32> = vec![7.0, 11.0];
let x = gpu_solve_f32(&eye, &b, 2, 1, &dev).unwrap();
assert!((x[0] - 7.0).abs() < 1e-4);
assert!((x[1] - 11.0).abs() < 1e-4);
}
#[test]
fn solve_multiple_rhs() {
let dev = device();
let a: Vec<f32> = vec![2.0, 1.0, 1.0, 3.0];
let b: Vec<f32> = vec![5.0, 3.0, 10.0, 7.0];
let x = gpu_solve_f32(&a, &b, 2, 2, &dev).unwrap();
let mut ax = [0.0f32; 4];
for i in 0..2 {
for j in 0..2 {
ax[i * 2 + j] = a[i * 2] * x[j] + a[i * 2 + 1] * x[2 + j];
}
}
for i in 0..4 {
assert!(
(ax[i] - b[i]).abs() < 1e-3,
"A*X[{i}] = {} vs B[{i}] = {}",
ax[i],
b[i]
);
}
}
#[test]
fn solve_empty() {
let dev = device();
let x = gpu_solve_f32(&[], &[], 0, 0, &dev).unwrap();
assert!(x.is_empty());
}
#[test]
fn qr_reconstructs_3x2() {
let dev = device();
let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let (m, n) = (3, 2);
let (q, r) = gpu_qr_f32(&a, m, n, &dev).unwrap();
let k = m.min(n);
assert_eq!(q.len(), m * k);
assert_eq!(r.len(), k * n);
let mut recon = vec![0.0f32; m * n];
for i in 0..m {
for j in 0..n {
let mut acc = 0.0f32;
for p in 0..k {
acc += q[i * k + p] * r[p * n + j];
}
recon[i * n + j] = acc;
}
}
for i in 0..m * n {
assert!(
(recon[i] - a[i]).abs() < 1e-3,
"QR reconstruction failed at {i}: {} vs {}",
recon[i],
a[i]
);
}
}
#[test]
fn qr_orthogonal_q() {
let dev = device();
let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let (m, n) = (3, 2);
let (q, _) = gpu_qr_f32(&a, m, n, &dev).unwrap();
let k = m.min(n);
let mut qtq = vec![0.0f32; k * k];
for i in 0..k {
for j in 0..k {
let mut acc = 0.0f32;
for p in 0..m {
acc += q[p * k + i] * q[p * k + j];
}
qtq[i * k + j] = acc;
}
}
for i in 0..k {
for j in 0..k {
let expected = if i == j { 1.0 } else { 0.0 };
assert!(
(qtq[i * k + j] - expected).abs() < 1e-3,
"Q^T*Q[{i},{j}] = {} (expected {})",
qtq[i * k + j],
expected
);
}
}
}
#[test]
fn qr_r_upper_triangular() {
let dev = device();
let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let (_, r) = gpu_qr_f32(&a, 3, 2, &dev).unwrap();
let k = 2;
let n = 2;
for i in 0..k {
for j in 0..i.min(n) {
assert!(
r[i * n + j].abs() < 1e-4,
"R[{i},{j}] = {} should be 0",
r[i * n + j]
);
}
}
}
#[test]
fn qr_square() {
let dev = device();
let a: Vec<f32> = vec![2.0, 1.0, 0.0, 1.0, 3.0, 1.0, 0.0, 1.0, 2.0];
let (q, r) = gpu_qr_f32(&a, 3, 3, &dev).unwrap();
let k = 3;
let n = 3;
let mut recon = [0.0f32; 9];
for i in 0..3 {
for j in 0..3 {
let mut acc = 0.0f32;
for p in 0..k {
acc += q[i * k + p] * r[p * n + j];
}
recon[i * n + j] = acc;
}
}
for i in 0..9 {
assert!(
(recon[i] - a[i]).abs() < 1e-3,
"QR square reconstruction failed at {i}: {} vs {}",
recon[i],
a[i]
);
}
}
#[test]
fn qr_empty() {
let dev = device();
let (q, r) = gpu_qr_f32(&[], 0, 0, &dev).unwrap();
assert!(q.is_empty());
assert!(r.is_empty());
}
}