// ndrs 0.3.0
//
// A tensor library with GPU support
#pragma once
#include <cstddef>
#include <cstdint>

// EXPORT marks a declaration as an exported symbol:
// `__declspec(dllexport)` when `_WIN32` is defined, otherwise the GCC/Clang
// `visibility("default")` attribute.
#ifdef _WIN32
#define EXPORT __declspec(dllexport)
#else
#define EXPORT __attribute__((visibility("default")))
#endif

extern "C"
{
    // Addition (float32), CPU entry point.
    // Takes two input buffers `a` and `b` and an output buffer `c`, each paired
    // with its own stride array, plus a shared `shape` array, the dimension
    // count `ndim`, and `total_elements`.
    // NOTE(review): presumably computes c = a + b element-wise; the header does
    // not show whether strides are counted in elements or bytes, or whether
    // `total_elements` must equal the product of `shape` — confirm against the
    // implementation.
    EXPORT void cpu_strided_add_f32(const float *a, const size_t *a_strides,
                                    const float *b, const size_t *b_strides,
                                    float *c, const size_t *c_strides,
                                    const size_t *shape, int ndim,
                                    size_t total_elements);
    // Addition (float32), GPU entry point.
    // Same parameter list as cpu_strided_add_f32 with a trailing `stream`
    // argument, and returns an int instead of void.
    // NOTE(review): `stream` is presumably an opaque GPU stream handle and the
    // return value presumably a status code; whether the pointers must refer
    // to device memory is not visible here — confirm against the
    // implementation.
    EXPORT int gpu_strided_add_f32(const float *a, const size_t *a_strides,
                                   const float *b, const size_t *b_strides,
                                   float *c, const size_t *c_strides,
                                   const size_t *shape, int ndim,
                                   size_t total_elements, void *stream);
    // Addition (int32), CPU entry point.
    // Takes two input buffers `a` and `b` and an output buffer `c`, each paired
    // with its own stride array, plus a shared `shape` array, the dimension
    // count `ndim`, and `total_elements`.
    // NOTE(review): presumably computes c = a + b element-wise; stride units
    // (elements vs. bytes) and integer-overflow behaviour are not visible from
    // this header — confirm against the implementation.
    EXPORT void cpu_strided_add_i32(const int32_t *a, const size_t *a_strides,
                                    const int32_t *b, const size_t *b_strides,
                                    int32_t *c, const size_t *c_strides,
                                    const size_t *shape, int ndim,
                                    size_t total_elements);

    // Addition (int32), GPU entry point.
    // Same parameter list as cpu_strided_add_i32 with a trailing `stream`
    // argument, and returns an int instead of void.
    // NOTE(review): `stream` is presumably an opaque GPU stream handle and the
    // return value presumably a status code — confirm against the
    // implementation.
    EXPORT int gpu_strided_add_i32(const int32_t *a, const size_t *a_strides,
                                   const int32_t *b, const size_t *b_strides,
                                   int32_t *c, const size_t *c_strides,
                                   const size_t *shape, int ndim,
                                   size_t total_elements, void *stream);

    // Strided copy (generic, byte granularity), CPU entry point.
    // Source and destination are untyped byte buffers, each with its own
    // offset and stride array; `shape` and `ndim` are shared, `elem_size` is
    // the size of one element, and `total_elements` is the element count.
    // NOTE(review): whether offsets and strides are expressed in bytes or in
    // elements is not visible from this header — confirm against the
    // implementation.
    EXPORT void cpu_strided_copy(const uint8_t *src, size_t src_offset,
                                 const size_t *src_strides, int ndim,
                                 const size_t *shape,
                                 uint8_t *dst, size_t dst_offset,
                                 const size_t *dst_strides,
                                 size_t elem_size, size_t total_elements);

    // Strided copy (generic, byte granularity), GPU entry point.
    // Same parameter list as cpu_strided_copy with a trailing `stream`
    // argument, and returns an int instead of void.
    // NOTE(review): `stream` is presumably an opaque GPU stream handle and the
    // return value presumably a status code — confirm against the
    // implementation.
    EXPORT int gpu_strided_copy(const uint8_t *src, size_t src_offset,
                                const size_t *src_strides, int ndim,
                                const size_t *shape,
                                uint8_t *dst, size_t dst_offset,
                                const size_t *dst_strides,
                                size_t elem_size, size_t total_elements,
                                void *stream);

    // Make contiguous, CPU entry point.
    // The source is a byte buffer described by an offset, a stride array,
    // `shape` and `ndim`; the destination takes no offset or strides.
    // NOTE(review): presumably gathers the strided source into a densely
    // packed destination of `total_elements * elem_size` bytes; the layout
    // order and the units of `src_offset`/`src_strides` are not visible from
    // this header — confirm against the implementation.
    EXPORT void cpu_contiguous(const uint8_t *src, size_t src_offset,
                               const size_t *src_strides, int ndim,
                               const size_t *shape,
                               uint8_t *dst, size_t elem_size,
                               size_t total_elements);

    // Make contiguous, GPU entry point.
    // Same parameter list as cpu_contiguous with a trailing `stream` argument,
    // and returns an int instead of void.
    // NOTE(review): `stream` is presumably an opaque GPU stream handle and the
    // return value presumably a status code — confirm against the
    // implementation.
    EXPORT int gpu_contiguous(const uint8_t *src, size_t src_offset,
                              const size_t *src_strides, int ndim,
                              const size_t *shape,
                              uint8_t *dst, size_t elem_size,
                              size_t total_elements,
                              void *stream);

    // Strided matrix multiply (float32), CPU entry point.
    // Each of the matrices `A`, `B` and `C` is passed as a base pointer with a
    // separate row stride and column stride; `M`, `N` and `K` are the problem
    // dimensions.
    // NOTE(review): presumably computes C (M x N) = A (M x K) * B (K x N);
    // stride units (elements vs. bytes) and whether C is overwritten or
    // accumulated into are not visible from this header — confirm against the
    // implementation.
    EXPORT void cpu_matmul_strided_f32(
        const float *A, size_t a_stride_row, size_t a_stride_col,
        const float *B, size_t b_stride_row, size_t b_stride_col,
        float *C, size_t c_stride_row, size_t c_stride_col,
        int M, int N, int K);
    // Strided matrix multiply (float32), GPU entry point.
    // Same parameter list as cpu_matmul_strided_f32 with a trailing `stream`
    // argument, and returns an int instead of void.
    // NOTE(review): `stream` is presumably an opaque GPU stream handle and the
    // return value presumably a status code — confirm against the
    // implementation.
    EXPORT int gpu_matmul_strided_f32(
        const float *A, size_t a_stride_row, size_t a_stride_col,
        const float *B, size_t b_stride_row, size_t b_stride_col,
        float *C, size_t c_stride_row, size_t c_stride_col,
        int M, int N, int K, void *stream);
}