use crate::tensor::{TensorHandle, copy_gpu_ref, launch_copy_perpendicular_ref};
use cubecl_core::{Runtime, client::ComputeClient, ir::StorageType, prelude::TensorBinding};
/// Copies `input` into a freshly allocated, fully contiguous tensor.
///
/// Allocates `num_elems * dtype.size()` bytes on the client, wraps the
/// allocation in a contiguous `TensorHandle`, then dispatches a copy from
/// the (possibly strided) input into it.
pub fn into_contiguous<R: Runtime>(
    client: &ComputeClient<R>,
    input: TensorBinding<R>,
    dtype: StorageType,
) -> TensorHandle<R> {
    // Total element count is the product of all dimension extents.
    let total: usize = input.shape.iter().product();
    let storage = client.empty(total * dtype.size());
    let shape = input.shape.to_vec();
    let output = TensorHandle::new_contiguous(shape, storage, dtype);
    copy_into(client, input, output.clone().binding(), dtype);
    output
}
/// Copies `input` into a new tensor whose layout may be pitched rather
/// than densely packed.
///
/// Rank-0 and rank-1 inputs cannot benefit from pitching, so they are
/// routed through the plain contiguous path instead.
pub fn into_contiguous_pitched<R: Runtime>(
    client: &ComputeClient<R>,
    input: TensorBinding<R>,
    dtype: StorageType,
) -> TensorHandle<R> {
    match input.shape.len() {
        // Pitching only matters for tensors of rank 2 or higher.
        0 | 1 => into_contiguous(client, input, dtype),
        _ => {
            let output = TensorHandle::empty(client, input.shape.clone(), dtype);
            copy_into(client, input, output.clone().binding(), dtype);
            output
        }
    }
}
/// Copies the data referenced by `input` into `output`, choosing a copy
/// strategy from the input layout and the target hardware.
///
/// When the client exposes CPU core information (used here as a proxy for
/// "this is a CPU backend") and the innermost stride is not 1 (elements
/// along the last axis are not adjacent in memory), the perpendicular copy
/// kernel is launched; otherwise the generic reference copy is used.
pub fn copy_into<R: Runtime>(
    client: &ComputeClient<R>,
    input: TensorBinding<R>,
    output: TensorBinding<R>,
    dtype: StorageType,
) {
    let rank = input.strides.len();
    // Guard the index: `rank - 1` would underflow (and panic) for a rank-0
    // input. A rank-0 tensor is trivially contiguous.
    let inner_stride_is_unit = rank == 0 || input.strides[rank - 1] == 1;
    let is_cpu = client.properties().hardware.num_cpu_cores.is_some();
    if is_cpu && !inner_stride_is_unit {
        launch_copy_perpendicular_ref(client, input, output, dtype);
    } else {
        copy_gpu_ref(client, input, output, dtype);
    }
}