/***************************************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
*permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this
*list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this
*list of conditions and the following disclaimer in the documentation and/or other
*materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors
*may be used to endorse or promote products derived from this software without specific
*prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
*EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
*OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
*SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
*EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
*SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
* \file dnn/src/cuda/convolution_helper/kernel.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "src/cuda/convolution_helper/bias_visitor.cuh"
#include "src/cuda/convolution_helper/config.cuh"
#include "src/cuda/convolution_helper/conv_trait/conv_trait.cuh"
#include "src/cuda/convolution_helper/epilogue.cuh"
namespace megdnn {
namespace cuda {
namespace convolution {
template <typename ConvTrait, typename BiasVisitor, typename Epilogue>
__global__ void convolution_kernel(
const typename ConvTrait::src_dtype* __restrict__ src,
const typename ConvTrait::filter_dtype* __restrict__ filter, BiasVisitor bias,
Epilogue epilogue, typename ConvTrait::Param param, float alpha, float beta) {
static bool constexpr check_bounds = ConvTrait::check_bounds;
using BlockTileIterator = typename ConvTrait::BlockTileIterator;
BlockTileIterator block_iterator;
// determine batch, out_channel, out_height, out_width of current thread
// block
block_iterator.template init_with_param<ConvTrait::Param>(param);
using DataTileCount = typename ConvTrait::DataTileCount;
using FilterTileCount = typename ConvTrait::FilterTileCount;
using DataGlobal2ShareMemVisitor = typename ConvTrait::DataGlobal2ShareMemVisitor;
using FilterGlobal2ShareMemVisitor =
typename ConvTrait::FilterGlobal2ShareMemVisitor;
using smem_storage_dtype = typename ConvTrait::smem_storage_dtype;
extern __shared__ smem_storage_dtype smem[];
smem_storage_dtype* smem_src = smem;
smem_storage_dtype* smem_filter = smem + DataTileCount::smem_tot;
smem_storage_dtype* smem_dst = smem_filter + FilterTileCount::smem_tot;
DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src};
FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter};
if (check_bounds) {
block_iterator.set_remain(src_gl2sh_visitor, filter_gl2sh_visitor);
}
using BlockConsumer = typename ConvTrait::BlockConsumer;
BlockConsumer block_consumer;
block_consumer.init_accumulator();
block_iterator.template iterate_with_param<
typename ConvTrait::InputLayout, typename ConvTrait::KernLayout>(
src, filter, param, src_gl2sh_visitor, filter_gl2sh_visitor,
block_consumer);
using GlobalMemoryWriter = typename ConvTrait::GlobalMemoryWriter;
GlobalMemoryWriter global_memory_writer;
global_memory_writer.init(smem_dst, alpha, beta);
if (check_bounds) {
block_iterator.set_remain(global_memory_writer);
}
bias.move(
block_iterator.block_batch, block_iterator.block_out_channel,
block_iterator.block_out_height, block_iterator.block_out_width);
epilogue.move(
block_iterator.block_batch, block_iterator.block_out_channel,
block_iterator.block_out_height, block_iterator.block_out_width);
global_memory_writer.template write<check_bounds>(bias, epilogue, block_consumer);
}
template <typename ConvTrait, typename BiasVisitor, typename Epilogue>
__global__ void convolution_kernel_precomp_offset(
const typename ConvTrait::src_dtype* __restrict__ src,
const typename ConvTrait::filter_dtype* __restrict__ filter,
const int* __restrict__ offset, BiasVisitor bias, Epilogue epilogue,
typename ConvTrait::Param param, float alpha, float beta) {
static bool constexpr check_bounds = ConvTrait::check_bounds;
using BlockTileIterator = typename ConvTrait::BlockTileIterator;
BlockTileIterator block_iterator;
// determine batch, out_channel, out_height, out_width of current thread
// block
block_iterator.template init_with_param<ConvTrait::Param>(param);
using DataTileCount = typename ConvTrait::DataTileCount;
using FilterTileCount = typename ConvTrait::FilterTileCount;
using DataGlobal2ShareMemVisitor = typename ConvTrait::DataGlobal2ShareMemVisitor;
using FilterGlobal2ShareMemVisitor =
typename ConvTrait::FilterGlobal2ShareMemVisitor;
using smem_storage_dtype = typename ConvTrait::smem_storage_dtype;
extern __shared__ smem_storage_dtype smem[];
smem_storage_dtype* smem_src = smem;
smem_storage_dtype* smem_filter = smem + DataTileCount::smem_tot;
smem_storage_dtype* smem_dst = smem_filter + FilterTileCount::smem_tot;
DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset};
FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter};
if (check_bounds) {
block_iterator.set_remain(src_gl2sh_visitor, filter_gl2sh_visitor);
}
using BlockConsumer = typename ConvTrait::BlockConsumer;
BlockConsumer block_consumer;
block_consumer.init_accumulator();
block_iterator.template iterate_with_param<
typename ConvTrait::InputLayout, typename ConvTrait::KernLayout>(
src, filter, param, src_gl2sh_visitor, filter_gl2sh_visitor,
block_consumer);
using GlobalMemoryWriter = typename ConvTrait::GlobalMemoryWriter;
GlobalMemoryWriter global_memory_writer;
global_memory_writer.init(smem_dst, alpha, beta);
if (check_bounds) {
block_iterator.set_remain(global_memory_writer);
}
bias.move(
block_iterator.block_batch, block_iterator.block_out_channel,
block_iterator.block_out_height, block_iterator.block_out_width);
epilogue.move(
block_iterator.block_batch, block_iterator.block_out_channel,
block_iterator.block_out_height, block_iterator.block_out_width);
global_memory_writer.template write<check_bounds>(bias, epilogue, block_consumer);
}
} // namespace convolution
} // namespace cuda
} // namespace megdnn
// vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}}