#[cfg(feature = "cuda")]
use cudarc::cusolver as cusolver_safe;
#[cfg(feature = "cuda")]
use cudarc::driver::{DevicePtr, DevicePtrMut};
#[cfg(feature = "cuda")]
use crate::buffer::CudaBuffer;
#[cfg(feature = "cuda")]
use crate::device::GpuDevice;
#[cfg(feature = "cuda")]
use crate::error::{GpuError, GpuResult};
#[cfg(not(feature = "cuda"))]
use crate::device::GpuDevice;
#[cfg(not(feature = "cuda"))]
use crate::error::{GpuError, GpuResult};
/// Transposes a row-major `m x n` matrix of `f32` into a row-major `n x m`
/// matrix — equivalently, re-lays out row-major data as column-major, which
/// is what the cuSOLVER dense routines expect.
fn transpose_f32(data: &[f32], m: usize, n: usize) -> Vec<f32> {
    let mut transposed = vec![0.0f32; m * n];
    for col in 0..n {
        for row in 0..m {
            transposed[col * m + row] = data[row * n + col];
        }
    }
    transposed
}
/// Transposes a row-major `m x n` matrix of `f64` into a row-major `n x m`
/// matrix — equivalently, re-lays out row-major data as column-major, which
/// is what the cuSOLVER dense routines expect.
fn transpose_f64(data: &[f64], m: usize, n: usize) -> Vec<f64> {
    let mut transposed = vec![0.0f64; m * n];
    for col in 0..n {
        for row in 0..m {
            transposed[col * m + row] = data[row * n + col];
        }
    }
    transposed
}
/// Copies the single-element cuSOLVER `devInfo` buffer back to the host and
/// returns its value (0 means the preceding factorization succeeded).
#[cfg(feature = "cuda")]
fn read_dev_info(info_buf: &CudaBuffer<i32>, device: &GpuDevice) -> GpuResult<i32> {
    crate::transfer::gpu_to_cpu(info_buf, device).map(|host| host[0])
}
/// Allocates a zero-initialized `i32` device buffer of `len` elements
/// (used for cuSOLVER `devInfo` flags and pivot arrays).
#[cfg(feature = "cuda")]
fn alloc_zeros_i32(len: usize, device: &GpuDevice) -> GpuResult<CudaBuffer<i32>> {
    crate::transfer::alloc_zeros::<i32>(len, device)
}
/// Uploads a host `f32` slice into a newly allocated device buffer.
#[cfg(feature = "cuda")]
fn upload_f32(data: &[f32], device: &GpuDevice) -> GpuResult<CudaBuffer<f32>> {
    crate::transfer::cpu_to_gpu(data, device)
}
/// Downloads a device `f32` buffer into a host `Vec`.
#[cfg(feature = "cuda")]
fn download_f32(buf: &CudaBuffer<f32>, device: &GpuDevice) -> GpuResult<Vec<f32>> {
    crate::transfer::gpu_to_cpu(buf, device)
}
/// Uploads a host `f64` slice into a newly allocated device buffer.
#[cfg(feature = "cuda")]
fn upload_f64(data: &[f64], device: &GpuDevice) -> GpuResult<CudaBuffer<f64>> {
    crate::transfer::cpu_to_gpu(data, device)
}
/// Downloads a device `f64` buffer into a host `Vec`.
#[cfg(feature = "cuda")]
fn download_f64(buf: &CudaBuffer<f64>, device: &GpuDevice) -> GpuResult<Vec<f64>> {
    crate::transfer::gpu_to_cpu(buf, device)
}
/// Thin SVD on the GPU: factors a row-major `m x n` matrix as
/// `A = U * diag(S) * V^T` using cuSOLVER's legacy `gesvd`.
///
/// Returns row-major `(U, S, V^T)` with shapes `m x k`, `k`, and `k x n`
/// where `k = min(m, n)`; all-empty vectors when `m == 0 || n == 0`.
///
/// # Errors
/// Propagates allocation/transfer/cuSOLVER errors; a nonzero `devInfo`
/// from `gesvd` (e.g. failure to converge) is reported as
/// `GpuError::ShapeMismatch`.
#[cfg(feature = "cuda")]
pub fn gpu_svd_f32(
    data: &[f32],
    m: usize,
    n: usize,
    device: &GpuDevice,
) -> GpuResult<(Vec<f32>, Vec<f32>, Vec<f32>)> {
    use cudarc::cusolver::sys as csys;
    if m == 0 || n == 0 {
        return Ok((vec![], vec![], vec![]));
    }
    // The legacy cusolverDn<t>gesvd API only supports m >= n (it returns
    // CUSOLVER_STATUS_INVALID_VALUE for wide matrices). For m < n, factor
    // the transpose instead: A^T = U' S V'^T implies A = (V'^T)^T S (U')^T,
    // so U = transpose(V'^T) and V^T = transpose(U').
    if m < n {
        let at = transpose_f32(data, m, n); // row-major n x m
        let (u_t, s, vt_t) = gpu_svd_f32(&at, n, m, device)?;
        let k = m; // min(m, n) since m < n here
        // U (m x k) = transpose of V'^T (k x m).
        let mut u_host = vec![0.0f32; m * k];
        for i in 0..m {
            for j in 0..k {
                u_host[i * k + j] = vt_t[j * m + i];
            }
        }
        // V^T (k x n) = transpose of U' (n x k).
        let mut vt_host = vec![0.0f32; k * n];
        for i in 0..k {
            for j in 0..n {
                vt_host[i * n + j] = u_t[j * k + i];
            }
        }
        return Ok((u_host, s, vt_host));
    }
    let k = n; // min(m, n) since m >= n here
    let stream = device.stream();
    let dn = cusolver_safe::DnHandle::new(stream.clone())?;
    // gesvd expects column-major storage.
    let col_major = transpose_f32(data, m, n);
    let mut d_a = upload_f32(&col_major, device)?;
    let mut d_s = crate::transfer::alloc_zeros_f32(k, device)?;
    let mut d_u = crate::transfer::alloc_zeros_f32(m * k, device)?;
    let mut d_vt = crate::transfer::alloc_zeros_f32(k * n, device)?;
    let mut d_info = alloc_zeros_i32(1, device)?;
    // Query the workspace size, then allocate at least one element.
    let mut lwork: i32 = 0;
    unsafe {
        csys::cusolverDnSgesvd_bufferSize(dn.cu(), m as i32, n as i32, &mut lwork).result()?;
    }
    let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (s_ptr, _s_sync) = d_s.inner_mut().device_ptr_mut(&stream);
        let (u_ptr, _u_sync) = d_u.inner_mut().device_ptr_mut(&stream);
        let (vt_ptr, _vt_sync) = d_vt.inner_mut().device_ptr_mut(&stream);
        let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
        // jobu = jobvt = 'S': economy-size U (m x k) and V^T (k x n).
        csys::cusolverDnSgesvd(
            dn.cu(),
            b'S' as i8,
            b'S' as i8,
            m as i32,
            n as i32,
            a_ptr as *mut f32,
            m as i32, // lda
            s_ptr as *mut f32,
            u_ptr as *mut f32,
            m as i32, // ldu
            vt_ptr as *mut f32,
            k as i32, // ldvt
            work_ptr as *mut f32,
            lwork,
            std::ptr::null_mut(), // rwork: only written when gesvd fails to converge
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val = read_dev_info(&d_info, device)?;
    if info_val != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_svd_f32",
            expected: vec![0],
            got: vec![info_val as usize],
        });
    }
    let s_host = download_f32(&d_s, device)?;
    // Convert U and V^T back from column-major to row-major.
    let u_col = download_f32(&d_u, device)?;
    let mut u_host = vec![0.0f32; m * k];
    for i in 0..m {
        for j in 0..k {
            u_host[i * k + j] = u_col[j * m + i];
        }
    }
    let vt_col = download_f32(&d_vt, device)?;
    let mut vt_host = vec![0.0f32; k * n];
    for i in 0..k {
        for j in 0..n {
            vt_host[i * n + j] = vt_col[j * k + i];
        }
    }
    Ok((u_host, s_host, vt_host))
}
/// Thin SVD on the GPU: factors a row-major `m x n` matrix as
/// `A = U * diag(S) * V^T` using cuSOLVER's legacy `gesvd` (`f64` variant).
///
/// Returns row-major `(U, S, V^T)` with shapes `m x k`, `k`, and `k x n`
/// where `k = min(m, n)`; all-empty vectors when `m == 0 || n == 0`.
///
/// # Errors
/// Propagates allocation/transfer/cuSOLVER errors; a nonzero `devInfo`
/// from `gesvd` (e.g. failure to converge) is reported as
/// `GpuError::ShapeMismatch`.
#[cfg(feature = "cuda")]
pub fn gpu_svd_f64(
    data: &[f64],
    m: usize,
    n: usize,
    device: &GpuDevice,
) -> GpuResult<(Vec<f64>, Vec<f64>, Vec<f64>)> {
    use cudarc::cusolver::sys as csys;
    if m == 0 || n == 0 {
        return Ok((vec![], vec![], vec![]));
    }
    // The legacy cusolverDn<t>gesvd API only supports m >= n (it returns
    // CUSOLVER_STATUS_INVALID_VALUE for wide matrices). For m < n, factor
    // the transpose instead: A^T = U' S V'^T implies A = (V'^T)^T S (U')^T,
    // so U = transpose(V'^T) and V^T = transpose(U').
    if m < n {
        let at = transpose_f64(data, m, n); // row-major n x m
        let (u_t, s, vt_t) = gpu_svd_f64(&at, n, m, device)?;
        let k = m; // min(m, n) since m < n here
        // U (m x k) = transpose of V'^T (k x m).
        let mut u_host = vec![0.0f64; m * k];
        for i in 0..m {
            for j in 0..k {
                u_host[i * k + j] = vt_t[j * m + i];
            }
        }
        // V^T (k x n) = transpose of U' (n x k).
        let mut vt_host = vec![0.0f64; k * n];
        for i in 0..k {
            for j in 0..n {
                vt_host[i * n + j] = u_t[j * k + i];
            }
        }
        return Ok((u_host, s, vt_host));
    }
    let k = n; // min(m, n) since m >= n here
    let stream = device.stream();
    let dn = cusolver_safe::DnHandle::new(stream.clone())?;
    // gesvd expects column-major storage.
    let col_major = transpose_f64(data, m, n);
    let mut d_a = upload_f64(&col_major, device)?;
    let mut d_s = crate::transfer::alloc_zeros_f64(k, device)?;
    let mut d_u = crate::transfer::alloc_zeros_f64(m * k, device)?;
    let mut d_vt = crate::transfer::alloc_zeros_f64(k * n, device)?;
    let mut d_info = alloc_zeros_i32(1, device)?;
    // Query the workspace size, then allocate at least one element.
    let mut lwork: i32 = 0;
    unsafe {
        csys::cusolverDnDgesvd_bufferSize(dn.cu(), m as i32, n as i32, &mut lwork).result()?;
    }
    let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (s_ptr, _s_sync) = d_s.inner_mut().device_ptr_mut(&stream);
        let (u_ptr, _u_sync) = d_u.inner_mut().device_ptr_mut(&stream);
        let (vt_ptr, _vt_sync) = d_vt.inner_mut().device_ptr_mut(&stream);
        let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
        // jobu = jobvt = 'S': economy-size U (m x k) and V^T (k x n).
        csys::cusolverDnDgesvd(
            dn.cu(),
            b'S' as i8,
            b'S' as i8,
            m as i32,
            n as i32,
            a_ptr as *mut f64,
            m as i32, // lda
            s_ptr as *mut f64,
            u_ptr as *mut f64,
            m as i32, // ldu
            vt_ptr as *mut f64,
            k as i32, // ldvt
            work_ptr as *mut f64,
            lwork,
            std::ptr::null_mut(), // rwork: only written when gesvd fails to converge
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val = read_dev_info(&d_info, device)?;
    if info_val != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_svd_f64",
            expected: vec![0],
            got: vec![info_val as usize],
        });
    }
    let s_host = download_f64(&d_s, device)?;
    // Convert U and V^T back from column-major to row-major.
    let u_col = download_f64(&d_u, device)?;
    let mut u_host = vec![0.0f64; m * k];
    for i in 0..m {
        for j in 0..k {
            u_host[i * k + j] = u_col[j * m + i];
        }
    }
    let vt_col = download_f64(&d_vt, device)?;
    let mut vt_host = vec![0.0f64; k * n];
    for i in 0..k {
        for j in 0..n {
            vt_host[i * n + j] = vt_col[j * k + i];
        }
    }
    Ok((u_host, s_host, vt_host))
}
/// Cholesky factorization on the GPU via cuSOLVER `potrf` (`f32`).
///
/// `data` is a row-major `n x n` matrix (presumably symmetric
/// positive-definite — TODO confirm callers guarantee symmetry). Returns
/// the lower-triangular factor `L` in row-major order with `A = L * L^T`,
/// or an empty vector when `n == 0`.
///
/// # Errors
/// A nonzero `devInfo` (matrix not positive-definite) is reported as
/// `GpuError::ShapeMismatch`; transfer/cuSOLVER errors are propagated.
#[cfg(feature = "cuda")]
pub fn gpu_cholesky_f32(
    data: &[f32],
    n: usize,
    device: &GpuDevice,
) -> GpuResult<Vec<f32>> {
    use cudarc::cusolver::sys as csys;
    if n == 0 {
        return Ok(vec![]);
    }
    let stream = device.stream();
    let dn = cusolver_safe::DnHandle::new(stream.clone())?;
    // potrf expects column-major storage.
    let col_major = transpose_f32(data, n, n);
    let mut d_a = upload_f32(&col_major, device)?;
    let mut d_info = alloc_zeros_i32(1, device)?;
    // Query the required workspace size for potrf.
    let mut lwork: i32 = 0;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        csys::cusolverDnSpotrf_bufferSize(
            dn.cu(),
            csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
            n as i32,
            a_ptr as *mut f32,
            n as i32,
            &mut lwork,
        )
        .result()?;
    }
    // Allocate at least one element even if lwork == 0.
    let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
        // In-place factorization: the lower triangle of d_a becomes L.
        csys::cusolverDnSpotrf(
            dn.cu(),
            csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
            n as i32,
            a_ptr as *mut f32,
            n as i32,
            work_ptr as *mut f32,
            lwork,
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val = read_dev_info(&d_info, device)?;
    if info_val != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_cholesky_f32: matrix is not positive-definite",
            expected: vec![0],
            got: vec![info_val as usize],
        });
    }
    // Convert back to row-major.
    let l_col = download_f32(&d_a, device)?;
    let mut l_host = vec![0.0f32; n * n];
    for i in 0..n {
        for j in 0..n {
            l_host[i * n + j] = l_col[j * n + i];
        }
    }
    // potrf leaves the strict upper triangle untouched (stale input values);
    // zero it so the result is a clean lower-triangular L.
    for i in 0..n {
        for j in (i + 1)..n {
            l_host[i * n + j] = 0.0;
        }
    }
    Ok(l_host)
}
/// Cholesky factorization on the GPU via cuSOLVER `potrf` (`f64`).
///
/// `data` is a row-major `n x n` matrix (presumably symmetric
/// positive-definite — TODO confirm callers guarantee symmetry). Returns
/// the lower-triangular factor `L` in row-major order with `A = L * L^T`,
/// or an empty vector when `n == 0`.
///
/// # Errors
/// A nonzero `devInfo` (matrix not positive-definite) is reported as
/// `GpuError::ShapeMismatch`; transfer/cuSOLVER errors are propagated.
#[cfg(feature = "cuda")]
pub fn gpu_cholesky_f64(
    data: &[f64],
    n: usize,
    device: &GpuDevice,
) -> GpuResult<Vec<f64>> {
    use cudarc::cusolver::sys as csys;
    if n == 0 {
        return Ok(vec![]);
    }
    let stream = device.stream();
    let dn = cusolver_safe::DnHandle::new(stream.clone())?;
    // potrf expects column-major storage.
    let col_major = transpose_f64(data, n, n);
    let mut d_a = upload_f64(&col_major, device)?;
    let mut d_info = alloc_zeros_i32(1, device)?;
    // Query the required workspace size for potrf.
    let mut lwork: i32 = 0;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        csys::cusolverDnDpotrf_bufferSize(
            dn.cu(),
            csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
            n as i32,
            a_ptr as *mut f64,
            n as i32,
            &mut lwork,
        )
        .result()?;
    }
    // Allocate at least one element even if lwork == 0.
    let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
        // In-place factorization: the lower triangle of d_a becomes L.
        csys::cusolverDnDpotrf(
            dn.cu(),
            csys::cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
            n as i32,
            a_ptr as *mut f64,
            n as i32,
            work_ptr as *mut f64,
            lwork,
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val = read_dev_info(&d_info, device)?;
    if info_val != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_cholesky_f64: matrix is not positive-definite",
            expected: vec![0],
            got: vec![info_val as usize],
        });
    }
    // Convert back to row-major.
    let l_col = download_f64(&d_a, device)?;
    let mut l_host = vec![0.0f64; n * n];
    for i in 0..n {
        for j in 0..n {
            l_host[i * n + j] = l_col[j * n + i];
        }
    }
    // potrf leaves the strict upper triangle untouched (stale input values);
    // zero it so the result is a clean lower-triangular L.
    for i in 0..n {
        for j in (i + 1)..n {
            l_host[i * n + j] = 0.0;
        }
    }
    Ok(l_host)
}
/// Solves the linear system `A * X = B` on the GPU using LU factorization
/// with partial pivoting (cuSOLVER `getrf` followed by `getrs`), `f32`.
///
/// `a_data` is row-major `n x n`; `b_data` is row-major `n x nrhs`.
/// Returns `X` in row-major `n x nrhs` order, or an empty vector when
/// `n == 0`.
///
/// # Errors
/// A nonzero `devInfo` from `getrf` (singular matrix) or `getrs` is
/// reported as `GpuError::ShapeMismatch`; transfer/cuSOLVER errors are
/// propagated.
#[cfg(feature = "cuda")]
pub fn gpu_solve_f32(
    a_data: &[f32],
    b_data: &[f32],
    n: usize,
    nrhs: usize,
    device: &GpuDevice,
) -> GpuResult<Vec<f32>> {
    use cudarc::cusolver::sys as csys;
    if n == 0 {
        return Ok(vec![]);
    }
    let stream = device.stream();
    let dn = cusolver_safe::DnHandle::new(stream.clone())?;
    // cuSOLVER expects column-major storage for both A and B.
    let a_col = transpose_f32(a_data, n, n);
    let mut d_a = upload_f32(&a_col, device)?;
    let b_col = transpose_f32(b_data, n, nrhs);
    let mut d_b = upload_f32(&b_col, device)?;
    // Pivot index array written by getrf, consumed by getrs.
    let mut d_ipiv = alloc_zeros_i32(n, device)?;
    let mut d_info = alloc_zeros_i32(1, device)?;
    // Query the workspace size for the LU factorization.
    let mut lwork: i32 = 0;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        csys::cusolverDnSgetrf_bufferSize(
            dn.cu(),
            n as i32,
            n as i32,
            a_ptr as *mut f32,
            n as i32,
            &mut lwork,
        )
        .result()?;
    }
    let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
        let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
        // In-place LU factorization of d_a with partial pivoting.
        csys::cusolverDnSgetrf(
            dn.cu(),
            n as i32,
            n as i32,
            a_ptr as *mut f32,
            n as i32,
            work_ptr as *mut f32,
            ipiv_ptr as *mut i32,
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val = read_dev_info(&d_info, device)?;
    if info_val != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_solve_f32: LU factorization failed (singular matrix)",
            expected: vec![0],
            got: vec![info_val as usize],
        });
    }
    // Separate devInfo for the triangular solve step.
    let mut d_info2 = alloc_zeros_i32(1, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
        let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner().device_ptr(&stream);
        let (b_ptr, _b_sync) = d_b.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
        // getrs overwrites d_b in place with the solution X.
        csys::cusolverDnSgetrs(
            dn.cu(),
            csys::cublasOperation_t::CUBLAS_OP_N, n as i32,
            nrhs as i32,
            a_ptr as *const f32,
            n as i32,
            ipiv_ptr as *const i32,
            b_ptr as *mut f32,
            n as i32, info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val2 = read_dev_info(&d_info2, device)?;
    if info_val2 != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_solve_f32: triangular solve failed",
            expected: vec![0],
            got: vec![info_val2 as usize],
        });
    }
    // Convert the solution back to row-major.
    let x_col = download_f32(&d_b, device)?;
    let mut x_host = vec![0.0f32; n * nrhs];
    for i in 0..n {
        for j in 0..nrhs {
            x_host[i * nrhs + j] = x_col[j * n + i];
        }
    }
    Ok(x_host)
}
/// Solves the linear system `A * X = B` on the GPU using LU factorization
/// with partial pivoting (cuSOLVER `getrf` followed by `getrs`), `f64`.
///
/// `a_data` is row-major `n x n`; `b_data` is row-major `n x nrhs`.
/// Returns `X` in row-major `n x nrhs` order, or an empty vector when
/// `n == 0`.
///
/// # Errors
/// A nonzero `devInfo` from `getrf` (singular matrix) or `getrs` is
/// reported as `GpuError::ShapeMismatch`; transfer/cuSOLVER errors are
/// propagated.
#[cfg(feature = "cuda")]
pub fn gpu_solve_f64(
    a_data: &[f64],
    b_data: &[f64],
    n: usize,
    nrhs: usize,
    device: &GpuDevice,
) -> GpuResult<Vec<f64>> {
    use cudarc::cusolver::sys as csys;
    if n == 0 {
        return Ok(vec![]);
    }
    let stream = device.stream();
    let dn = cusolver_safe::DnHandle::new(stream.clone())?;
    // cuSOLVER expects column-major storage for both A and B.
    let a_col = transpose_f64(a_data, n, n);
    let mut d_a = upload_f64(&a_col, device)?;
    let b_col = transpose_f64(b_data, n, nrhs);
    let mut d_b = upload_f64(&b_col, device)?;
    // Pivot index array written by getrf, consumed by getrs.
    let mut d_ipiv = alloc_zeros_i32(n, device)?;
    let mut d_info = alloc_zeros_i32(1, device)?;
    // Query the workspace size for the LU factorization.
    let mut lwork: i32 = 0;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        csys::cusolverDnDgetrf_bufferSize(
            dn.cu(),
            n as i32,
            n as i32,
            a_ptr as *mut f64,
            n as i32,
            &mut lwork,
        )
        .result()?;
    }
    let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
        let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
        // In-place LU factorization of d_a with partial pivoting.
        csys::cusolverDnDgetrf(
            dn.cu(),
            n as i32,
            n as i32,
            a_ptr as *mut f64,
            n as i32,
            work_ptr as *mut f64,
            ipiv_ptr as *mut i32,
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val = read_dev_info(&d_info, device)?;
    if info_val != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_solve_f64: LU factorization failed (singular matrix)",
            expected: vec![0],
            got: vec![info_val as usize],
        });
    }
    // Separate devInfo for the triangular solve step.
    let mut d_info2 = alloc_zeros_i32(1, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
        let (ipiv_ptr, _ipiv_sync) = d_ipiv.inner().device_ptr(&stream);
        let (b_ptr, _b_sync) = d_b.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
        // getrs overwrites d_b in place with the solution X.
        csys::cusolverDnDgetrs(
            dn.cu(),
            csys::cublasOperation_t::CUBLAS_OP_N, n as i32,
            nrhs as i32,
            a_ptr as *const f64,
            n as i32,
            ipiv_ptr as *const i32,
            b_ptr as *mut f64,
            n as i32, info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val2 = read_dev_info(&d_info2, device)?;
    if info_val2 != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_solve_f64: triangular solve failed",
            expected: vec![0],
            got: vec![info_val2 as usize],
        });
    }
    // Convert the solution back to row-major.
    let x_col = download_f64(&d_b, device)?;
    let mut x_host = vec![0.0f64; n * nrhs];
    for i in 0..n {
        for j in 0..nrhs {
            x_host[i * nrhs + j] = x_col[j * n + i];
        }
    }
    Ok(x_host)
}
/// Thin QR factorization on the GPU (`f32`): `A = Q * R` with `Q` row-major
/// `m x k` and `R` row-major `k x n` upper triangular/trapezoidal, where
/// `k = min(m, n)`. Uses cuSOLVER `geqrf` (Householder factorization)
/// followed by `orgqr` (materialize the explicit Q). Returns empty vectors
/// when `m == 0 || n == 0`.
///
/// # Errors
/// A nonzero `devInfo` from either step is reported as
/// `GpuError::ShapeMismatch`; transfer/cuSOLVER errors are propagated.
#[cfg(feature = "cuda")]
pub fn gpu_qr_f32(
    data: &[f32],
    m: usize,
    n: usize,
    device: &GpuDevice,
) -> GpuResult<(Vec<f32>, Vec<f32>)> {
    use cudarc::cusolver::sys as csys;
    if m == 0 || n == 0 {
        return Ok((vec![], vec![]));
    }
    let k = m.min(n);
    let stream = device.stream();
    let dn = cusolver_safe::DnHandle::new(stream.clone())?;
    // geqrf expects column-major storage.
    let col_major = transpose_f32(data, m, n);
    let mut d_a = upload_f32(&col_major, device)?;
    // tau holds the k Householder reflector scalars produced by geqrf.
    let mut d_tau = crate::transfer::alloc_zeros_f32(k, device)?;
    let mut d_info = alloc_zeros_i32(1, device)?;
    // Workspace-size query for geqrf.
    let mut lwork: i32 = 0;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        csys::cusolverDnSgeqrf_bufferSize(
            dn.cu(),
            m as i32,
            n as i32,
            a_ptr as *mut f32,
            m as i32,
            &mut lwork,
        )
        .result()?;
    }
    let mut d_work = crate::transfer::alloc_zeros_f32(lwork.max(1) as usize, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (tau_ptr, _tau_sync) = d_tau.inner_mut().device_ptr_mut(&stream);
        let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
        // After this call d_a holds R in its upper triangle and the
        // Householder vectors below the diagonal.
        csys::cusolverDnSgeqrf(
            dn.cu(),
            m as i32,
            n as i32,
            a_ptr as *mut f32,
            m as i32,
            tau_ptr as *mut f32,
            work_ptr as *mut f32,
            lwork,
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val = read_dev_info(&d_info, device)?;
    if info_val != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_qr_f32: geqrf failed",
            expected: vec![0],
            got: vec![info_val as usize],
        });
    }
    // Extract R (row-major, upper triangular) NOW, before orgqr overwrites
    // d_a with the explicit Q.
    let qr_col = download_f32(&d_a, device)?;
    let mut r_host = vec![0.0f32; k * n];
    for i in 0..k {
        for j in 0..n {
            if j >= i {
                r_host[i * n + j] = qr_col[j * m + i]; }
        }
    }
    // Workspace-size query for orgqr (builds the first k columns of Q).
    let mut lwork_orgqr: i32 = 0;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
        let (tau_ptr, _tau_sync) = d_tau.inner().device_ptr(&stream);
        csys::cusolverDnSorgqr_bufferSize(
            dn.cu(),
            m as i32,
            k as i32,
            k as i32,
            a_ptr as *const f32,
            m as i32,
            tau_ptr as *const f32,
            &mut lwork_orgqr,
        )
        .result()?;
    }
    let mut d_work2 = crate::transfer::alloc_zeros_f32(lwork_orgqr.max(1) as usize, device)?;
    let mut d_info2 = alloc_zeros_i32(1, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (tau_ptr, _tau_sync) = d_tau.inner().device_ptr(&stream);
        let (work_ptr, _work_sync) = d_work2.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
        // Overwrites the first k columns of d_a with the explicit Q.
        csys::cusolverDnSorgqr(
            dn.cu(),
            m as i32,
            k as i32,
            k as i32,
            a_ptr as *mut f32,
            m as i32,
            tau_ptr as *const f32,
            work_ptr as *mut f32,
            lwork_orgqr,
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val2 = read_dev_info(&d_info2, device)?;
    if info_val2 != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_qr_f32: orgqr failed",
            expected: vec![0],
            got: vec![info_val2 as usize],
        });
    }
    // Convert the first k columns (Q) back to row-major.
    let q_full_col = download_f32(&d_a, device)?;
    let mut q_host = vec![0.0f32; m * k];
    for i in 0..m {
        for j in 0..k {
            q_host[i * k + j] = q_full_col[j * m + i]; }
    }
    Ok((q_host, r_host))
}
/// Thin QR factorization on the GPU (`f64`): `A = Q * R` with `Q` row-major
/// `m x k` and `R` row-major `k x n` upper triangular/trapezoidal, where
/// `k = min(m, n)`. Uses cuSOLVER `geqrf` (Householder factorization)
/// followed by `orgqr` (materialize the explicit Q). Returns empty vectors
/// when `m == 0 || n == 0`.
///
/// # Errors
/// A nonzero `devInfo` from either step is reported as
/// `GpuError::ShapeMismatch`; transfer/cuSOLVER errors are propagated.
#[cfg(feature = "cuda")]
pub fn gpu_qr_f64(
    data: &[f64],
    m: usize,
    n: usize,
    device: &GpuDevice,
) -> GpuResult<(Vec<f64>, Vec<f64>)> {
    use cudarc::cusolver::sys as csys;
    if m == 0 || n == 0 {
        return Ok((vec![], vec![]));
    }
    let k = m.min(n);
    let stream = device.stream();
    let dn = cusolver_safe::DnHandle::new(stream.clone())?;
    // geqrf expects column-major storage.
    let col_major = transpose_f64(data, m, n);
    let mut d_a = upload_f64(&col_major, device)?;
    // tau holds the k Householder reflector scalars produced by geqrf.
    let mut d_tau = crate::transfer::alloc_zeros_f64(k, device)?;
    let mut d_info = alloc_zeros_i32(1, device)?;
    // Workspace-size query for geqrf.
    let mut lwork: i32 = 0;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        csys::cusolverDnDgeqrf_bufferSize(
            dn.cu(),
            m as i32,
            n as i32,
            a_ptr as *mut f64,
            m as i32,
            &mut lwork,
        )
        .result()?;
    }
    let mut d_work = crate::transfer::alloc_zeros_f64(lwork.max(1) as usize, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (tau_ptr, _tau_sync) = d_tau.inner_mut().device_ptr_mut(&stream);
        let (work_ptr, _work_sync) = d_work.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info.inner_mut().device_ptr_mut(&stream);
        // After this call d_a holds R in its upper triangle and the
        // Householder vectors below the diagonal.
        csys::cusolverDnDgeqrf(
            dn.cu(),
            m as i32,
            n as i32,
            a_ptr as *mut f64,
            m as i32,
            tau_ptr as *mut f64,
            work_ptr as *mut f64,
            lwork,
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val = read_dev_info(&d_info, device)?;
    if info_val != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_qr_f64: geqrf failed",
            expected: vec![0],
            got: vec![info_val as usize],
        });
    }
    // Extract R (row-major, upper triangular) NOW, before orgqr overwrites
    // d_a with the explicit Q.
    let qr_col = download_f64(&d_a, device)?;
    let mut r_host = vec![0.0f64; k * n];
    for i in 0..k {
        for j in 0..n {
            if j >= i {
                r_host[i * n + j] = qr_col[j * m + i]; }
        }
    }
    // Workspace-size query for orgqr (builds the first k columns of Q).
    let mut lwork_orgqr: i32 = 0;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner().device_ptr(&stream);
        let (tau_ptr, _tau_sync) = d_tau.inner().device_ptr(&stream);
        csys::cusolverDnDorgqr_bufferSize(
            dn.cu(),
            m as i32,
            k as i32,
            k as i32,
            a_ptr as *const f64,
            m as i32,
            tau_ptr as *const f64,
            &mut lwork_orgqr,
        )
        .result()?;
    }
    let mut d_work2 = crate::transfer::alloc_zeros_f64(lwork_orgqr.max(1) as usize, device)?;
    let mut d_info2 = alloc_zeros_i32(1, device)?;
    unsafe {
        let (a_ptr, _a_sync) = d_a.inner_mut().device_ptr_mut(&stream);
        let (tau_ptr, _tau_sync) = d_tau.inner().device_ptr(&stream);
        let (work_ptr, _work_sync) = d_work2.inner_mut().device_ptr_mut(&stream);
        let (info_ptr, _info_sync) = d_info2.inner_mut().device_ptr_mut(&stream);
        // Overwrites the first k columns of d_a with the explicit Q.
        csys::cusolverDnDorgqr(
            dn.cu(),
            m as i32,
            k as i32,
            k as i32,
            a_ptr as *mut f64,
            m as i32,
            tau_ptr as *const f64,
            work_ptr as *mut f64,
            lwork_orgqr,
            info_ptr as *mut i32,
        )
        .result()?;
    }
    stream.synchronize()?;
    let info_val2 = read_dev_info(&d_info2, device)?;
    if info_val2 != 0 {
        return Err(GpuError::ShapeMismatch {
            op: "gpu_qr_f64: orgqr failed",
            expected: vec![0],
            got: vec![info_val2 as usize],
        });
    }
    // Convert the first k columns (Q) back to row-major.
    let q_full_col = download_f64(&d_a, device)?;
    let mut q_host = vec![0.0f64; m * k];
    for i in 0..m {
        for j in 0..k {
            q_host[i * k + j] = q_full_col[j * m + i]; }
    }
    Ok((q_host, r_host))
}
/// CPU fallback when the `cuda` feature is disabled; always returns
/// `GpuError::NoCudaFeature`.
#[cfg(not(feature = "cuda"))]
pub fn gpu_svd_f32(
    _data: &[f32],
    _m: usize,
    _n: usize,
    _device: &GpuDevice,
) -> GpuResult<(Vec<f32>, Vec<f32>, Vec<f32>)> {
    Err(GpuError::NoCudaFeature)
}
/// CPU fallback when the `cuda` feature is disabled; always returns
/// `GpuError::NoCudaFeature`.
#[cfg(not(feature = "cuda"))]
pub fn gpu_svd_f64(
    _data: &[f64],
    _m: usize,
    _n: usize,
    _device: &GpuDevice,
) -> GpuResult<(Vec<f64>, Vec<f64>, Vec<f64>)> {
    Err(GpuError::NoCudaFeature)
}
/// CPU fallback when the `cuda` feature is disabled; always returns
/// `GpuError::NoCudaFeature`.
#[cfg(not(feature = "cuda"))]
pub fn gpu_cholesky_f32(
    _data: &[f32],
    _n: usize,
    _device: &GpuDevice,
) -> GpuResult<Vec<f32>> {
    Err(GpuError::NoCudaFeature)
}
/// CPU fallback when the `cuda` feature is disabled; always returns
/// `GpuError::NoCudaFeature`.
#[cfg(not(feature = "cuda"))]
pub fn gpu_cholesky_f64(
    _data: &[f64],
    _n: usize,
    _device: &GpuDevice,
) -> GpuResult<Vec<f64>> {
    Err(GpuError::NoCudaFeature)
}
/// CPU fallback when the `cuda` feature is disabled; always returns
/// `GpuError::NoCudaFeature`.
#[cfg(not(feature = "cuda"))]
pub fn gpu_solve_f32(
    _a_data: &[f32],
    _b_data: &[f32],
    _n: usize,
    _nrhs: usize,
    _device: &GpuDevice,
) -> GpuResult<Vec<f32>> {
    Err(GpuError::NoCudaFeature)
}
/// CPU fallback when the `cuda` feature is disabled; always returns
/// `GpuError::NoCudaFeature`.
#[cfg(not(feature = "cuda"))]
pub fn gpu_solve_f64(
    _a_data: &[f64],
    _b_data: &[f64],
    _n: usize,
    _nrhs: usize,
    _device: &GpuDevice,
) -> GpuResult<Vec<f64>> {
    Err(GpuError::NoCudaFeature)
}
/// CPU fallback when the `cuda` feature is disabled; always returns
/// `GpuError::NoCudaFeature`.
#[cfg(not(feature = "cuda"))]
pub fn gpu_qr_f32(
    _data: &[f32],
    _m: usize,
    _n: usize,
    _device: &GpuDevice,
) -> GpuResult<(Vec<f32>, Vec<f32>)> {
    Err(GpuError::NoCudaFeature)
}
/// CPU fallback when the `cuda` feature is disabled; always returns
/// `GpuError::NoCudaFeature`.
#[cfg(not(feature = "cuda"))]
pub fn gpu_qr_f64(
    _data: &[f64],
    _m: usize,
    _n: usize,
    _device: &GpuDevice,
) -> GpuResult<(Vec<f64>, Vec<f64>)> {
    Err(GpuError::NoCudaFeature)
}
#[cfg(test)]
#[cfg(feature = "cuda")]
mod tests {
    use super::*;

    // These tests require building with the `cuda` feature and a usable
    // CUDA device 0; they exercise the real cuSOLVER paths end-to-end.
    fn device() -> GpuDevice {
        GpuDevice::new(0).expect("CUDA device 0")
    }

    #[test]
    fn svd_reconstructs_3x2() {
        let dev = device();
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let (m, n) = (3, 2);
        let (u, s, vt) = gpu_svd_f32(&a, m, n, &dev).unwrap();
        let k = m.min(n);
        assert_eq!(u.len(), m * k);
        assert_eq!(s.len(), k);
        assert_eq!(vt.len(), k * n);
        // Rebuild A = U * diag(S) * V^T and compare elementwise.
        let mut recon = vec![0.0f32; m * n];
        for i in 0..m {
            for j in 0..n {
                let mut acc = 0.0f32;
                for p in 0..k {
                    acc += u[i * k + p] * s[p] * vt[p * n + j];
                }
                recon[i * n + j] = acc;
            }
        }
        for i in 0..m * n {
            assert!(
                (recon[i] - a[i]).abs() < 1e-3,
                "SVD reconstruction failed at {i}: {} vs {}",
                recon[i],
                a[i]
            );
        }
    }

    #[test]
    fn svd_singular_values_descending() {
        let dev = device();
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let (_, s, _) = gpu_svd_f32(&a, 3, 2, &dev).unwrap();
        assert!(s[0] >= s[1], "singular values must be descending");
    }

    #[test]
    fn svd_square_identity() {
        let dev = device();
        let eye: Vec<f32> = vec![1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0];
        let (_, s, _) = gpu_svd_f32(&eye, 3, 3, &dev).unwrap();
        for &sv in &s {
            assert!((sv - 1.0).abs() < 1e-4, "identity SVD should have all ones");
        }
    }

    #[test]
    fn svd_empty() {
        // Degenerate 0x0 input must short-circuit to empty outputs.
        let dev = device();
        let (u, s, vt) = gpu_svd_f32(&[], 0, 0, &dev).unwrap();
        assert!(u.is_empty());
        assert!(s.is_empty());
        assert!(vt.is_empty());
    }

    #[test]
    fn cholesky_spd_3x3() {
        let dev = device();
        // Symmetric positive-definite test matrix.
        #[rustfmt::skip]
        let a: Vec<f32> = vec![
            6.0, 5.0, 1.0,
            5.0, 12.0, 5.0,
            1.0, 5.0, 6.0,
        ];
        let l = gpu_cholesky_f32(&a, 3, &dev).unwrap();
        // L must be strictly lower triangular above the diagonal.
        for i in 0..3 {
            for j in (i + 1)..3 {
                assert!(
                    l[i * 3 + j].abs() < 1e-5,
                    "L[{i},{j}] = {} should be 0",
                    l[i * 3 + j]
                );
            }
        }
        // Check A == L * L^T elementwise.
        let mut llt = vec![0.0f32; 9];
        for i in 0..3 {
            for j in 0..3 {
                let mut acc = 0.0f32;
                for p in 0..3 {
                    acc += l[i * 3 + p] * l[j * 3 + p];
                }
                llt[i * 3 + j] = acc;
            }
        }
        for i in 0..9 {
            assert!(
                (llt[i] - a[i]).abs() < 1e-3,
                "L*L^T[{i}] = {} vs A[{i}] = {}",
                llt[i],
                a[i]
            );
        }
    }

    #[test]
    fn cholesky_identity() {
        // Cholesky of I is I.
        let dev = device();
        let eye: Vec<f32> = vec![1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0];
        let l = gpu_cholesky_f32(&eye, 3, &dev).unwrap();
        for i in 0..3 {
            for j in 0..3 {
                let expected = if i == j { 1.0 } else { 0.0 };
                assert!(
                    (l[i * 3 + j] - expected).abs() < 1e-5,
                    "L[{i},{j}] = {} (expected {})",
                    l[i * 3 + j],
                    expected
                );
            }
        }
    }

    #[test]
    fn cholesky_empty() {
        let dev = device();
        let l = gpu_cholesky_f32(&[], 0, &dev).unwrap();
        assert!(l.is_empty());
    }

    #[test]
    fn solve_2x2_simple() {
        // [[2,1],[1,3]] * [1,3]^T == [5,10]^T
        let dev = device();
        let a: Vec<f32> = vec![2.0, 1.0, 1.0, 3.0];
        let b: Vec<f32> = vec![5.0, 10.0];
        let x = gpu_solve_f32(&a, &b, 2, 1, &dev).unwrap();
        assert!((x[0] - 1.0).abs() < 1e-3, "x[0] = {} (expected 1.0)", x[0]);
        assert!((x[1] - 3.0).abs() < 1e-3, "x[1] = {} (expected 3.0)", x[1]);
    }

    #[test]
    fn solve_identity() {
        // I * x = b implies x == b.
        let dev = device();
        let eye: Vec<f32> = vec![1.0, 0.0, 0.0, 1.0];
        let b: Vec<f32> = vec![7.0, 11.0];
        let x = gpu_solve_f32(&eye, &b, 2, 1, &dev).unwrap();
        assert!((x[0] - 7.0).abs() < 1e-4);
        assert!((x[1] - 11.0).abs() < 1e-4);
    }

    #[test]
    fn solve_multiple_rhs() {
        let dev = device();
        let a: Vec<f32> = vec![2.0, 1.0, 1.0, 3.0];
        // B is row-major 2 x 2: column j is the j-th right-hand side.
        let b: Vec<f32> = vec![5.0, 3.0, 10.0, 7.0];
        let x = gpu_solve_f32(&a, &b, 2, 2, &dev).unwrap();
        // Verify A * X == B elementwise (X is row-major 2 x 2).
        let mut ax = vec![0.0f32; 4];
        for i in 0..2 {
            for j in 0..2 {
                ax[i * 2 + j] = a[i * 2] * x[j] + a[i * 2 + 1] * x[2 + j];
            }
        }
        for i in 0..4 {
            assert!(
                (ax[i] - b[i]).abs() < 1e-3,
                "A*X[{i}] = {} vs B[{i}] = {}",
                ax[i],
                b[i]
            );
        }
    }

    #[test]
    fn solve_empty() {
        let dev = device();
        let x = gpu_solve_f32(&[], &[], 0, 0, &dev).unwrap();
        assert!(x.is_empty());
    }

    #[test]
    fn qr_reconstructs_3x2() {
        let dev = device();
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let (m, n) = (3, 2);
        let (q, r) = gpu_qr_f32(&a, m, n, &dev).unwrap();
        let k = m.min(n);
        assert_eq!(q.len(), m * k);
        assert_eq!(r.len(), k * n);
        // Rebuild A = Q * R and compare elementwise.
        let mut recon = vec![0.0f32; m * n];
        for i in 0..m {
            for j in 0..n {
                let mut acc = 0.0f32;
                for p in 0..k {
                    acc += q[i * k + p] * r[p * n + j];
                }
                recon[i * n + j] = acc;
            }
        }
        for i in 0..m * n {
            assert!(
                (recon[i] - a[i]).abs() < 1e-3,
                "QR reconstruction failed at {i}: {} vs {}",
                recon[i],
                a[i]
            );
        }
    }

    #[test]
    fn qr_orthogonal_q() {
        let dev = device();
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let (m, n) = (3, 2);
        let (q, _) = gpu_qr_f32(&a, m, n, &dev).unwrap();
        let k = m.min(n);
        // Q^T * Q must be the k x k identity (orthonormal columns).
        let mut qtq = vec![0.0f32; k * k];
        for i in 0..k {
            for j in 0..k {
                let mut acc = 0.0f32;
                for p in 0..m {
                    acc += q[p * k + i] * q[p * k + j];
                }
                qtq[i * k + j] = acc;
            }
        }
        for i in 0..k {
            for j in 0..k {
                let expected = if i == j { 1.0 } else { 0.0 };
                assert!(
                    (qtq[i * k + j] - expected).abs() < 1e-3,
                    "Q^T*Q[{i},{j}] = {} (expected {})",
                    qtq[i * k + j],
                    expected
                );
            }
        }
    }

    #[test]
    fn qr_r_upper_triangular() {
        let dev = device();
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let (_, r) = gpu_qr_f32(&a, 3, 2, &dev).unwrap();
        let k = 2;
        let n = 2;
        // All entries strictly below the diagonal must be zero.
        for i in 0..k {
            for j in 0..i.min(n) {
                assert!(
                    r[i * n + j].abs() < 1e-4,
                    "R[{i},{j}] = {} should be 0",
                    r[i * n + j]
                );
            }
        }
    }

    #[test]
    fn qr_square() {
        let dev = device();
        let a: Vec<f32> = vec![2.0, 1.0, 0.0, 1.0, 3.0, 1.0, 0.0, 1.0, 2.0];
        let (q, r) = gpu_qr_f32(&a, 3, 3, &dev).unwrap();
        let k = 3;
        let n = 3;
        // Rebuild A = Q * R and compare elementwise.
        let mut recon = vec![0.0f32; 9];
        for i in 0..3 {
            for j in 0..3 {
                let mut acc = 0.0f32;
                for p in 0..k {
                    acc += q[i * k + p] * r[p * n + j];
                }
                recon[i * n + j] = acc;
            }
        }
        for i in 0..9 {
            assert!(
                (recon[i] - a[i]).abs() < 1e-3,
                "QR square reconstruction failed at {i}: {} vs {}",
                recon[i],
                a[i]
            );
        }
    }

    #[test]
    fn qr_empty() {
        let dev = device();
        let (q, r) = gpu_qr_f32(&[], 0, 0, &dev).unwrap();
        assert!(q.is_empty());
        assert!(r.is_empty());
    }
}