megenginelite-sys 1.8.2

/**
 * \file dnn/src/cuda/memory_utils.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#if MEGDNN_CC_CUDA
#pragma once
#include "src/cuda/utils.cuh"

namespace megdnn {
namespace cuda {

/**
 * \brief Transpose a 4x4 tile of bytes packed into four 32-bit words.
 *
 * Each input word is treated as a row of four bytes, byte 0 being the least
 * significant one. After the call, byte j of dst<i> equals byte i of src<j>.
 *
 * \param src0,src1,src2,src3 input rows, four packed bytes each
 * \param[out] dst0,dst1,dst2,dst3 output rows (the columns of the input tile)
 */
MEGDNN_DEVICE __forceinline__ void transpose_int8_4x4_impl(
        const int src0, const int src1, const int src2, const int src3, int& dst0,
        int& dst1, int& dst2, int& dst3) {
    // __byte_perm(x, y, s) numbers the bytes of x as 0..3 and those of y as
    // 4..7; hex digit k of s (counted from the least significant digit) selects
    // the source byte that becomes result byte k.
    constexpr unsigned kZipLowBytes = 0x5140;     // -> {x0, y0, x1, y1}
    constexpr unsigned kZipHighBytes = 0x7362;    // -> {x2, y2, x3, y3}
    constexpr unsigned kJoinLowHalves = 0x5410;   // -> {x0, x1, y0, y1}
    constexpr unsigned kJoinHighHalves = 0x7632;  // -> {x2, x3, y2, y3}

    // Stage 1: interleave the bytes of each pair of rows.
    const int rows01_cols01 = __byte_perm(src0, src1, kZipLowBytes);
    const int rows01_cols23 = __byte_perm(src0, src1, kZipHighBytes);
    const int rows23_cols01 = __byte_perm(src2, src3, kZipLowBytes);
    const int rows23_cols23 = __byte_perm(src2, src3, kZipHighBytes);

    // Stage 2: stitch matching 16-bit halves together so that every output
    // word holds one column of the original tile.
    dst0 = __byte_perm(rows01_cols01, rows23_cols01, kJoinLowHalves);
    dst1 = __byte_perm(rows01_cols01, rows23_cols01, kJoinHighHalves);
    dst2 = __byte_perm(rows01_cols23, rows23_cols23, kJoinLowHalves);
    dst3 = __byte_perm(rows01_cols23, rows23_cols23, kJoinHighHalves);
}

/**
 * \brief Functor family that runs transpose_int8_4x4_impl over consecutive
 * groups of four words of an input fragment.
 *
 * Only the declaration is given in this header; the usable variants are the
 * explicit specializations <4, int>, <8, int2> and <16, int4>.
 *
 * \tparam interleaved number of int elements in the input Fragment, as used by
 *      the specializations in this file
 * \tparam vec_type element type of the four-element output array
 */
template <uint32_t interleaved, typename vec_type>
struct transpose_int8_interleavedx4;

/**
 * \brief Specialization for a fragment of four ints and plain int outputs.
 *
 * The four input words are byte-transposed as one 4x4 tile straight into
 * dst[0..3].
 */
template <>
struct transpose_int8_interleavedx4<4, int> {
    static constexpr uint32_t interleaved = 4;
    using vec_type = int;
    using Fragment = array_wrapper<int, interleaved>;

    /**
     * \param src four packed input words
     * \param[out] dst the four transposed words
     */
    MEGDNN_DEVICE __forceinline__ void operator()(
            const Fragment src, vec_type (&dst)[4]) {
        // Read the input rows first, then transpose them in a single call.
        const int row0 = src[0];
        const int row1 = src[1];
        const int row2 = src[2];
        const int row3 = src[3];
        transpose_int8_4x4_impl(
                row0, row1, row2, row3, dst[0], dst[1], dst[2], dst[3]);
    }
};

/**
 * \brief Specialization for a fragment of eight ints and int2 outputs.
 *
 * Words 0..3 of the fragment are byte-transposed into the .x components of
 * dst[0..3]; words 4..7 go into the .y components.
 */
template <>
struct transpose_int8_interleavedx4<8, int2> {
    static constexpr uint32_t interleaved = 8;
    using vec_type = int2;
    using Fragment = array_wrapper<int, interleaved>;

    /**
     * \param src eight packed input words
     * \param[out] dst four int2 values receiving the two transposed tiles
     */
    MEGDNN_DEVICE __forceinline__ void operator()(
            const Fragment src, vec_type (&dst)[4]) {
        vec_type& out0 = dst[0];
        vec_type& out1 = dst[1];
        vec_type& out2 = dst[2];
        vec_type& out3 = dst[3];

        // First tile: words 0..3 -> .x components.
        transpose_int8_4x4_impl(
                src[0], src[1], src[2], src[3], out0.x, out1.x, out2.x, out3.x);
        // Second tile: words 4..7 -> .y components.
        transpose_int8_4x4_impl(
                src[4], src[5], src[6], src[7], out0.y, out1.y, out2.y, out3.y);
    }
};

/**
 * \brief Specialization for a fragment of sixteen ints and int4 outputs.
 *
 * The fragment is processed as four consecutive groups of four words. Group k
 * (k = 0..3) is byte-transposed into the k-th component (.x, .y, .z, .w) of
 * dst[0..3].
 */
template <>
struct transpose_int8_interleavedx4<16, int4> {
    static constexpr uint32_t interleaved = 16;
    using vec_type = int4;
    using Fragment = array_wrapper<int, interleaved>;

    /**
     * \param src sixteen packed input words
     * \param[out] dst four int4 values receiving the four transposed tiles
     */
    MEGDNN_DEVICE __forceinline__ void operator()(
            const Fragment src, vec_type (&dst)[4]) {
        vec_type& out0 = dst[0];
        vec_type& out1 = dst[1];
        vec_type& out2 = dst[2];
        vec_type& out3 = dst[3];

        // Words 0..3 -> .x components.
        transpose_int8_4x4_impl(
                src[0], src[1], src[2], src[3], out0.x, out1.x, out2.x, out3.x);
        // Words 4..7 -> .y components.
        transpose_int8_4x4_impl(
                src[4], src[5], src[6], src[7], out0.y, out1.y, out2.y, out3.y);
        // Words 8..11 -> .z components.
        transpose_int8_4x4_impl(
                src[8], src[9], src[10], src[11], out0.z, out1.z, out2.z, out3.z);
        // Words 12..15 -> .w components.
        transpose_int8_4x4_impl(
                src[12], src[13], src[14], src[15], out0.w, out1.w, out2.w, out3.w);
    }
};

}  // namespace cuda
}  // namespace megdnn
#endif

// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}