megenginelite-sys 1.8.2

/**
 * \file dnn/src/cuda/convpooling/conv_pooling_utils.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once
#include <cuda_runtime_api.h>
#include <math.h>
#include <algorithm>
#include "src/cuda/utils.cuh"

//#include "./helper.cuh"

namespace megdnn {
namespace cuda {
namespace conv_pool {

#define CUDA_CHKERR(call)                                                       \
    do {                                                                        \
        cudaError_t code = (call);                                              \
        megdnn_assert(                                                          \
                code == cudaSuccess, "cuda err %d: %s (call %s at %s:%s:%d)",   \
                int(code), cudaGetErrorString(code), #call, __FILE__, __func__, \
                __LINE__);                                                      \
    } while (0)

#define CUDA_CHK_KERN_ERR CUDA_CHKERR(cudaDeviceSynchronize());

static inline int __host__ align_to_warp(int n) {
    int x = n / 32 * 32;
    if (!x)
        x = n;
    return x;
}

// --- Nonline ---
struct Relu {
    static __device__ float apply(float x) { return x > 0 ? x : 0; }
};

struct Sigmoid {
    static __device__ float apply(float x) {
        float exp_value = exp((double)-x);
        return 1 / (1 + exp_value);
    }
};

struct Identity {
    static __device__ float apply(float x) { return x; }
};

// --- Static Reduce ---
template <int size, class Op>
struct StaticReduce {
    static __device__ float apply(const float* val) {
        const int half = size / 2;
        return Op::apply(
                StaticReduce<half, Op>::apply(val),
                StaticReduce<size - half, Op>::apply(val + half));
    }
};

template <class Op>
struct StaticReduce<1, Op> {
    static __device__ float apply(const float* val) { return val[0]; }
};

template <class Op>
struct StaticReduce<2, Op> {
    static __device__ float apply(const float* val) {
        return Op::apply(val[0], val[1]);
    }
};

struct OpAdd {
    static __device__ float apply(float a, float b) { return a + b; }
};

struct OpMax {
    static __device__ float apply(float a, float b) { return max(a, b); }
};

struct IdxGetterConvolution {
    static inline __device__ int apply(int kern, int i, int p) {
        return kern - i - 1 + p;
    }
};

struct IdxGetterCorrRel {
    static inline __device__ int apply(int kern, int i, int p) { return i - p; }
};

// --- Pooling ---
struct MeanPooler {
    template <int pool_shape_h, int pool_shape_w>
    static __device__ float apply(const float* val) {
        const int size = pool_shape_h * pool_shape_w;
        return StaticReduce<size, OpAdd>::apply(val) / size;
    }
};

struct MaxPooler {
    template <int pool_shape_h, int pool_shape_w>
    static __device__ float apply(const float* val) {
        return StaticReduce<pool_shape_h * pool_shape_w, OpMax>::apply(val);
    }
};

// --- Reader ---
class Tex1DReader {
    cudaTextureObject_t m_tex;
    int m_base_offset, m_chl_stride, m_row_stride, m_row_offset;
    // size_t batch_, chal_, height_, weight_;

public:
    // Set attributes of texture Object
    /*__device__ void init(cudaTextureObject_t& tex,
        size_t batch, size_t chal, size_t height, size_t weight) {
        batch_  = batch;
        chal_   = chal;
        height_ = height;
        weight_ = weight;
        m_chl_stride  = height * weight;
        m_row_stride  = weight;
    }

    __device__ void set_pos(cudaTextureObject_t& tex,
        // Current position
        size_t n, size_t c, size_t h, size_t w) {
        m_tex   = tex;
        m_base_offset = ((n * chal_ + c) * height_ + h) * weight_ + w;
    }
    */
    __device__ void set_pos(
            cudaTextureObject_t& tex,
            // Current position
            int chal, int height, int weight, int n, int c, int h, int w) {
        m_chl_stride = height * weight;
        m_row_stride = weight;
        m_tex = tex;
        m_base_offset = ((n * chal + c) * height + h) * weight + w;
    }

    __device__ void reset_row() { m_row_offset = m_base_offset; }

    __device__ void next_row() { m_row_offset += m_row_stride; }

    __device__ void next_channel() { m_base_offset += m_chl_stride; }

    __device__ float get(int /*dr*/, int dc) {
        return tex1Dfetch<float>(m_tex, dc + m_row_offset);
    }

    __device__ float get(int idx) {
        return tex1Dfetch<float>(m_tex, idx + m_base_offset);
    }
};

extern __host__ void create_cuda_tex(
        float* input, cudaTextureObject_t& tex, size_t N, size_t IC, size_t IH,
        size_t IW);

}  // namespace conv_pool
}  // namespace cuda
}  // namespace megdnn
// vim: syntax=cpp.doxygen