#![cfg(feature = "cuda")]
use ferrotorch_gpu::kernels::{gpu_transpose_complex_f32, gpu_transpose_complex_f64};
use ferrotorch_gpu::transfer::{cpu_to_gpu, gpu_to_cpu};
use ferrotorch_gpu::{GpuDevice, init_cuda_backend};
/// Makes sure a GPU backend is available before a test touches the device.
///
/// Does nothing when `has_gpu_backend()` already reports a backend; otherwise
/// calls `init_cuda_backend()`.
///
/// # Panics
///
/// Panics if `init_cuda_backend()` returns an error.
fn ensure_init() {
    if ferrotorch_core::gpu_dispatch::has_gpu_backend() {
        return;
    }
    init_cuda_backend().expect("init_cuda_backend");
}
/// CPU reference for the repack the GPU kernels are checked against.
///
/// `input` is treated as `n * n` consecutive two-value pairs (presumably
/// interleaved re/im, given the kernel names). Pair `(row, col)` of the result,
/// stored at pair index `row * n + col`, is copied from pair index
/// `col * n + row` of `input`.
///
/// The returned vector has the same length as `input`; any values beyond the
/// first `2 * n * n` are left at `0.0`.
///
/// # Panics
///
/// Panics if `input` holds fewer than `2 * n * n` values.
fn expected_repack(input: &[f64], n: usize) -> Vec<f64> {
    let mut repacked = vec![0.0; input.len()];
    for row in 0..n {
        for col in 0..n {
            // Offsets are in scalars, hence the factor of two per pair.
            let dst = 2 * (row * n + col);
            let src = 2 * (col * n + row);
            repacked[dst..dst + 2].copy_from_slice(&input[src..src + 2]);
        }
    }
    repacked
}
/// Runs `gpu_transpose_complex_f32` on a `2 * n * n`-value buffer (`n = 5`)
/// and checks every downloaded value against `expected_repack` with an
/// absolute tolerance of `1e-5`.
#[test]
fn transpose_complex_f32_repack_correct() {
    ensure_init();
    let dev = GpuDevice::new(0).expect("device");

    let n = 5;
    let len = 2 * n * n;
    // Distinct value per slot so any misplaced element is detectable.
    let host: Vec<f32> = (0..len).map(|i| i as f32 + 0.25).collect();

    // Host -> device, run the kernel, device -> host.
    let d_in = cpu_to_gpu(&host, &dev).expect("upload");
    let d_out = gpu_transpose_complex_f32(&d_in, n, &dev).expect("kernel must run on-device");
    let got = gpu_to_cpu(&d_out, &dev).expect("download");

    // The reference works in f64, so widen the input before repacking.
    let widened: Vec<f64> = host.iter().copied().map(f64::from).collect();
    let want = expected_repack(&widened, n);

    for (k, &expected) in want.iter().enumerate() {
        let actual = got[k];
        let delta = (f64::from(actual) - expected).abs();
        assert!(
            delta < 1e-5,
            "f32 mismatch at {k}: got {}, want {}",
            actual,
            expected
        );
    }
}
/// Runs `gpu_transpose_complex_f64` on a `2 * n * n`-value buffer (`n = 4`)
/// and checks every downloaded value against `expected_repack` with an
/// absolute tolerance of `1e-12`.
#[test]
fn transpose_complex_f64_repack_correct() {
    ensure_init();
    let dev = GpuDevice::new(0).expect("device");

    let n = 4;
    let len = 2 * n * n;
    // Distinct value per slot so any misplaced element is detectable.
    let host: Vec<f64> = (0..len).map(|i| i as f64 * 1.5 - 3.0).collect();

    // Host -> device, run the kernel, device -> host.
    let d_in = cpu_to_gpu(&host, &dev).expect("upload");
    let d_out = gpu_transpose_complex_f64(&d_in, n, &dev).expect("kernel must run on-device");
    let got = gpu_to_cpu(&d_out, &dev).expect("download");

    let want = expected_repack(&host, n);

    for (k, &expected) in want.iter().enumerate() {
        let actual = got[k];
        let delta = (actual - expected).abs();
        assert!(
            delta < 1e-12,
            "f64 mismatch at {k}: got {}, want {}",
            actual,
            expected
        );
    }
}