ndrs 0.3.0

A tensor library with GPU support
use std::ffi::c_void;

// Raw declarations of the native CPU/GPU tensor kernels.
//
// Everything here is a foreign function, so every call is `unsafe` and Rust
// checks none of the pointer arguments. What the native side requires of each
// pointer (lifetime, length, alignment, host vs. device memory) is not visible
// from these declarations; consult the native sources before calling.
//
// Conventions visible in the signatures:
// * every `gpu_*` function takes a trailing opaque `stream` pointer and returns
//   an `i32`; the `cpu_*` functions take no stream and return nothing.
// * the add kernels pass `shape` before `ndim`, whereas the copy/contiguous
//   kernels pass `ndim` before `shape` — keep this in mind at call sites.
//
// NOTE(review): the `i32` results are presumably status codes (0 = success?) and
// `stream` is presumably a CUDA-style stream handle — confirm against the
// native implementation.
// NOTE(review): whether strides/offsets are counted in elements or in bytes is
// not determinable from here — confirm before use.
extern "C" {
    /// CPU kernel taking two `f32` inputs (`a`, `b`) and one `f32` output (`c`),
    /// each paired with its own stride array, plus a shared `shape` array.
    ///
    /// Presumably writes the element-wise sum `a + b` into `c` — confirm
    /// against the native implementation.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked. Presumably each stride
    /// array and `shape` must hold `ndim` entries, and `total_elements` must
    /// equal the product of `shape` — TODO confirm.
    pub fn cpu_strided_add_f32(
        a: *const f32,
        a_strides: *const usize,
        b: *const f32,
        b_strides: *const usize,
        c: *mut f32,
        c_strides: *const usize,
        shape: *const usize,
        ndim: i32,
        total_elements: usize,
    );

    /// GPU counterpart of [`cpu_strided_add_f32`]: same arguments plus an
    /// opaque `stream` pointer, and an `i32` result.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked. Presumably the data
    /// pointers must reference device-accessible memory — TODO confirm.
    pub fn gpu_strided_add_f32(
        a: *const f32,
        a_strides: *const usize,
        b: *const f32,
        b_strides: *const usize,
        c: *mut f32,
        c_strides: *const usize,
        shape: *const usize,
        ndim: i32,
        total_elements: usize,
        stream: *mut c_void,
    ) -> i32;

    /// `i32` variant of [`cpu_strided_add_f32`]; the signature is identical
    /// except for the element type of `a`, `b` and `c`.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked.
    pub fn cpu_strided_add_i32(
        a: *const i32,
        a_strides: *const usize,
        b: *const i32,
        b_strides: *const usize,
        c: *mut i32,
        c_strides: *const usize,
        shape: *const usize,
        ndim: i32,
        total_elements: usize,
    );

    /// `i32` variant of [`gpu_strided_add_f32`]; the signature is identical
    /// except for the element type of `a`, `b` and `c`.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked.
    pub fn gpu_strided_add_i32(
        a: *const i32,
        a_strides: *const usize,
        b: *const i32,
        b_strides: *const usize,
        c: *mut i32,
        c_strides: *const usize,
        shape: *const usize,
        ndim: i32,
        total_elements: usize,
        stream: *mut c_void,
    ) -> i32;

    /// CPU kernel operating on untyped byte buffers: a source (`src`,
    /// `src_offset`, `src_strides`) and a destination (`dst`, `dst_offset`,
    /// `dst_strides`) sharing one `shape`, with the element width given by
    /// `elem_size`.
    ///
    /// Presumably copies every element of the strided source view into the
    /// strided destination view — confirm against the native implementation.
    ///
    /// Note that `ndim` precedes `shape` here, unlike in the add kernels.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked.
    pub fn cpu_strided_copy(
        src: *const u8,
        src_offset: usize,
        src_strides: *const usize,
        ndim: i32,
        shape: *const usize,
        dst: *mut u8,
        dst_offset: usize,
        dst_strides: *const usize,
        elem_size: usize,
        total_elements: usize,
    );

    /// GPU counterpart of [`cpu_strided_copy`]: same arguments plus an opaque
    /// `stream` pointer, and an `i32` result.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked.
    pub fn gpu_strided_copy(
        src: *const u8,
        src_offset: usize,
        src_strides: *const usize,
        ndim: i32,
        shape: *const usize,
        dst: *mut u8,
        dst_offset: usize,
        dst_strides: *const usize,
        elem_size: usize,
        total_elements: usize,
        stream: *mut c_void,
    ) -> i32;

    /// CPU kernel taking a strided byte source (`src`, `src_offset`,
    /// `src_strides`, `shape`) and a destination `dst` that has neither an
    /// offset nor strides.
    ///
    /// Presumably materialises the strided view into a densely packed buffer
    /// of `total_elements * elem_size` bytes — confirm against the native
    /// implementation.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked.
    pub fn cpu_contiguous(
        src: *const u8,
        src_offset: usize,
        src_strides: *const usize,
        ndim: i32,
        shape: *const usize,
        dst: *mut u8,
        elem_size: usize,
        total_elements: usize,
    );

    /// GPU counterpart of [`cpu_contiguous`]: same arguments plus an opaque
    /// `stream` pointer, and an `i32` result.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked.
    pub fn gpu_contiguous(
        src: *const u8,
        src_offset: usize,
        src_strides: *const usize,
        ndim: i32,
        shape: *const usize,
        dst: *mut u8,
        elem_size: usize,
        total_elements: usize,
        stream: *mut c_void,
    ) -> i32;

    /// CPU kernel taking three `f32` matrices (`a`, `b` as inputs, `c` as
    /// output), each described by a row stride and a column stride, plus the
    /// three `i32` dimensions `m`, `n` and `k`.
    ///
    /// Presumably computes `c = a * b` with `a` being `m x k`, `b` being
    /// `k x n` and `c` being `m x n` — confirm against the native
    /// implementation.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked.
    pub fn cpu_matmul_strided_f32(
        a: *const f32,
        a_stride_row: usize,
        a_stride_col: usize,
        b: *const f32,
        b_stride_row: usize,
        b_stride_col: usize,
        c: *mut f32,
        c_stride_row: usize,
        c_stride_col: usize,
        m: i32,
        n: i32,
        k: i32,
    );

    /// GPU counterpart of [`cpu_matmul_strided_f32`]: same arguments plus an
    /// opaque `stream` pointer, and an `i32` result.
    ///
    /// # Safety
    ///
    /// All pointers are handed to native code unchecked.
    pub fn gpu_matmul_strided_f32(
        a: *const f32,
        a_stride_row: usize,
        a_stride_col: usize,
        b: *const f32,
        b_stride_row: usize,
        b_stride_col: usize,
        c: *mut f32,
        c_stride_row: usize,
        c_stride_col: usize,
        m: i32,
        n: i32,
        k: i32,
        // Same type as before; spelled via the imported `c_void` like every
        // other `stream` parameter in this block.
        stream: *mut c_void,
    ) -> i32;
}