#pragma once
#include <cstdint>
#include <memory>
#include <unordered_map>
#include "megdnn/oprs/base.h"
#include "src/common/algo_base.h"
#include "src/common/utils.h"
#include "src/fallback/handle.h"
#include "src/naive/convolution/opr_impl.h"
namespace megdnn {
//! Category tag of a convolution algorithm. It is one half of
//! ConvAlgoTypePack and the element type returned by
//! fallback::ConvolutionImpl::suggest_algo_category_order().
//! NOTE(review): the enumerator names suggest direct / im2col / Winograd /
//! naive implementations; the numeric values are spelled out explicitly, so
//! presumably they must stay stable -- confirm before renumbering.
enum class AlgoCategory : int32_t {
DIRECT = 0,
IM2COL = 1,
WINOGRAD = 2,
NAIVE = 3,
};
//! Pair of (data type class, algorithm category) describing an algorithm.
//! Returned by fallback::ConvolutionImpl::AlgoBase::get_algo_type() and taken
//! by fallback::ConvolutionImpl::select_algo_type().
struct ConvAlgoTypePack {
//! both members are declared as 32-bit wide bit-fields
detail::AlgoDataType data_type : 32;
AlgoCategory algo_category : 32;
};
namespace fallback {
class ConvolutionImpl : public naive::ConvolutionForwardImpl {
public:
using naive::ConvolutionForwardImpl::ConvolutionForwardImpl;
using AlgoSelectionStrategy = detail::AlgoSelectionStrategy;
using AlgoDataType = detail::AlgoDataType;
//! The declarations below override the operator interface inherited from
//! naive::ConvolutionForwardImpl; their definitions are not in this header.
//! Execute the forward convolution of \p src with \p filter into \p dst,
//! optionally given a preprocessed filter and using \p workspace.
void exec(
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst,
const PreprocessedFilter*, _megdnn_workspace workspace) override;
//! Fill \p preprocessed_filter from \p filter for the given src/dst layouts.
void exec_preprocess(
const TensorLayout& src_layout, _megdnn_tensor_in filter,
const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
//! Workspace size in bytes required by exec() for the given layouts.
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, const PreprocessedFilter*) override;
//! Layouts of the tensors that make up the preprocessed filter.
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) override;
//! Workspace size in bytes required by exec_preprocess().
size_t get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) override;
//! Algorithms for the given layouts.
//! NOTE(review): whether unusable algorithms are filtered out is decided in
//! the implementation file -- confirm there.
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) override;
//! "Safe" variant of get_all_algorithms().
//! NOTE(review): the difference from get_all_algorithms() is not visible in
//! this header -- confirm against the base class documentation.
std::vector<Algorithm*> get_all_algorithms_safe(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) override;
//! Heuristically choose one algorithm under a workspace limit and the
//! required (\p positive_attr) / forbidden (\p negative_attr) attributes.
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
//! Size-only description of one convolution problem (no data pointers). It
//! is what AlgoBase::usable(), get_workspace() and dispatch_kern() receive.
struct NCBKernSizeParam {
//! NOTE(review): presumably the batch size (unpacked as N by
//! UNPACK_CONV_F32_NCB_KERN_SIZES) -- confirm
uint32_t n;
//! input / output spatial sizes; index 0 and 1 are unpacked as IH/IW and
//! OH/OW by UNPACK_CONV_F32_NCB_KERN_SIZES
std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
//! canonized filter description (icpg, ocpg, spatial, stride, padding, ...)
CanonizedFilterMeta filter_meta;
DType src_type, filter_type, dst_type;
//! batch strides of input / output, counted in elements: the NCBKernParam
//! accessors multiply them by the dtype size to obtain byte offsets
ptrdiff_t inp_bs, out_bs;
//! NOTE(review): presumably the per-dimension strides of input / output --
//! confirm against make_ncb_kern_size_param()
ptrdiff_t inp_s[4], out_s[4];
Param::ComputeMode compute_mode;
//! NOTE(review): presumably the number of worker threads of the handle --
//! confirm
size_t nr_threads;
//! non-owning pointer to the preprocessed filter; may presumably be null
//! when no preprocessing is used -- confirm
const PreprocessedFilter* preprocessed_filter;
//! deduce the AlgoDataType class of this problem (defined out of line)
AlgoDataType deduce_algo_data_type() const;
};
/*!
 * \brief Runtime kernel parameter: the size description inherited from
 *      NCBKernSizeParam plus the tensor and workspace pointers.
 *
 * The parameterless accessors check \c T against the stored dtype through
 * DType::assert_is_compatible_ctype before casting. The overloads taking
 * batch / group indices perform no dtype check; they only add a byte offset
 * to the base pointer.
 */
struct NCBKernParam : public NCBKernSizeParam {
    RefPtr src_ptr;
    RefPtr filter_ptr;
    RefPtr dst_ptr;
    void* workspace_ptr;
    size_t workspace_size;

    //! base pointer of the source tensor, viewed as \c T
    template <typename T>
    const T* src() const {
        src_type.assert_is_compatible_ctype<T>();
        return static_cast<const T*>(src_ptr.get_ptr());
    }

    //! base pointer of the filter tensor, viewed as \c T
    template <typename T>
    const T* filter() const {
        filter_type.assert_is_compatible_ctype<T>();
        return static_cast<const T*>(filter_ptr.get_ptr());
    }

    //! base pointer of the destination tensor, viewed as \c T
    template <typename T>
    T* dst() const {
        dst_type.assert_is_compatible_ctype<T>();
        return static_cast<T*>(dst_ptr.get_ptr());
    }

    //! workspace pointer viewed as \c T (no dtype check)
    template <typename T>
    T* workspace() const {
        return static_cast<T*>(workspace_ptr);
    }

    /*!
     * \brief destination pointer for one batch and one (packed) group
     *
     * The byte offset is
     * batch_id * out_bs * sizeof(elem) +
     * group_pack_size * group_pack_id * ocpg * osz[0] * osz[1] * sizeof(elem).
     */
    template <typename T>
    T* dst(size_t batch_id, size_t group_pack_id,
           size_t group_pack_size = 1_z) const {
        size_t batch_offset = batch_id * out_bs * dst_type.size();
        size_t group_offset = group_pack_size * group_pack_id * filter_meta.ocpg *
                              osz[0] * osz[1] * dst_type.size();
        // std::uintptr_t is the integer type meant to hold a pointer value;
        // unsigned arithmetic also keeps the address computation free of
        // signed/unsigned mixing
        return reinterpret_cast<T*>(
                reinterpret_cast<std::uintptr_t>(dst_ptr.get_ptr()) + batch_offset +
                group_offset);
    }

    /*!
     * \brief source pointer for one batch and one (packed) group
     *
     * The byte offset is
     * batch_id * inp_bs * sizeof(elem) +
     * group_pack_size * group_pack_id * icpg * isz[0] * isz[1] * sizeof(elem).
     */
    template <typename T>
    const T* src(
            size_t batch_id, size_t group_pack_id,
            size_t group_pack_size = 1_z) const {
        size_t batch_offset = batch_id * inp_bs * src_type.size();
        size_t group_offset = group_pack_size * group_pack_id * filter_meta.icpg *
                              isz[0] * isz[1] * src_type.size();
        // cast straight to const T*: this accessor hands out read-only data
        return reinterpret_cast<const T*>(
                reinterpret_cast<std::uintptr_t>(src_ptr.get_ptr()) + batch_offset +
                group_offset);
    }

    /*!
     * \brief filter pointer for one (packed) group
     *
     * The byte offset is pack_group_size * group_pack_id * icpg * ocpg *
     * spatial[0] * spatial[1] * sizeof(elem).
     */
    template <typename T>
    const T* filter(size_t group_pack_id, size_t pack_group_size = 1_z) const {
        size_t group_offset = pack_group_size * group_pack_id * filter_meta.icpg *
                              filter_meta.ocpg * filter_meta.spatial[0] *
                              filter_meta.spatial[1] * filter_type.size();
        return reinterpret_cast<const T*>(
                reinterpret_cast<std::uintptr_t>(filter_ptr.get_ptr()) + group_offset);
    }
};
//! Index argument handed to an ncb_kern_t next to the NCBKernParam.
struct NCBKernIndex {
    //! id of the executing thread; defaults to 0
    size_t thread_id = 0;
    //! NOTE(review): presumably the position of this invocation inside
    //! NCBKern::global_size -- confirm
    CpuNDRange ndrange_id;
};
//! Callable type of one kernel: receives the runtime parameter and the index
//! of the current invocation.
using ncb_kern_t =
        thin_function<void(const NCBKernParam& param, const NCBKernIndex& ncb_index)>;

//! A kernel together with the ND-range it is associated with.
struct NCBKern {
    ncb_kern_t kern;
    //! NOTE(review): presumably the iteration space over which \c kern is
    //! dispatched -- confirm against exec_with_ncb_kern()
    CpuNDRange global_size;
};
/*!
 * \brief Base class of the forward convolution algorithms of the fallback
 *      handle.
 *
 * Concrete algorithms must implement usable(), get_workspace(),
 * dispatch_kern() and get_algo_type(); the preprocess related hooks and
 * is_preferred() have defaults.
 */
class AlgoBase : public Algorithm {
public:
    //! tag the algorithm as belonging to the FALLBACK handle type
    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::FALLBACK; }

    enum class AlgoType : uint32_t {
        FB_ALGO = 1 << 0,
        FB_NAIVE,
        FB_DEFAULT,
    };

    virtual ~AlgoBase() = default;

    //! whether this algorithm can handle \p param under the given strategy
    virtual bool usable(
            const NCBKernSizeParam& param, AlgoSelectionStrategy) const = 0;

    //! workspace required for \p param
    virtual size_t get_workspace(const NCBKernSizeParam& param) const = 0;

    //! kernels that carry out the convolution described by \p param
    virtual SmallVector<NCBKern> dispatch_kern(
            const NCBKernSizeParam& param) const = 0;

    //! kernels for filter preprocessing; none by default
    virtual SmallVector<NCBKern> dispatch_preprocess_kern(
            const NCBKernSizeParam&) const {
        return {};
    }

    //! layouts of the preprocessed filter; none by default
    virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const NCBKernSizeParam&) const {
        return {};
    }

    //! workspace required for filter preprocessing; zero by default
    virtual size_t get_preprocess_workspace(const NCBKernSizeParam&) const {
        return 0_z;
    }

    //! whether this algorithm should be preferred for the given sizes;
    //! false by default
    virtual bool is_preferred(const NCBKernSizeParam&) const { return false; }

    /*!
     * \brief attribute-aware usability check
     *
     * True only if the algorithm has every attribute in \p positive_attr,
     * none of the attributes in \p negative_attr, and usable() accepts
     * \p param. The checks run in that order and stop at the first failure.
     */
    bool usable_attribute(
            const NCBKernSizeParam& param,
            AlgoSelectionStrategy algo_selection_strategy,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) const {
        if (!contain_attribute_all(positive_attr)) {
            return false;
        }
        if (contain_attribute_any(negative_attr)) {
            return false;
        }
        return usable(param, algo_selection_strategy);
    }

    //! data type class and category of this algorithm
    virtual ConvAlgoTypePack get_algo_type() const = 0;

    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
};
//! List of the algorithms of this implementation; virtual so that derived
//! implementations can supply their own list.
virtual SmallVector<AlgoBase*> get_all_packed_algo();
//! Algorithms matching \p algo_type (a data type class / category pair).
SmallVector<AlgoBase*> select_algo_type(ConvAlgoTypePack algo_type);
protected:
//! Counterparts of the public entry points that work on the NCB kernel
//! parameter structs instead of tensors / layouts.
virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo);
virtual void exec_preprocess_with_ncb_kern(
const NCBKernParam& param, Algorithm* algo);
virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
const NCBKernSizeParam& param);
virtual Algorithm* get_algorithm_heuristic_with_ncb(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr);
const char* get_algorithm_set_name() const override;
//! concrete algorithm classes and their container; only forward-declared
//! in this header
class AlgoFallback;
class AlgoNaive;
class AlgoDefault;
class AlgoPack;
private:
//! NOTE(review): this pair looks like a one-entry cache of the most recent
//! algorithm selection, keyed by the size parameter -- confirm in the
//! implementation of get_algorithm()
NCBKernSizeParam m_prev_selected_algo_sizep;
Algorithm* m_prev_selected_algo = nullptr;
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
bool is_naive_algo(ConvolutionImpl::Algorithm* algo);
//! select the algorithm for \p param; \p workspace_size defaults to the
//! largest size_t value
Algorithm* get_algorithm(
const NCBKernSizeParam& param,
size_t workspace_size = std::numeric_limits<size_t>::max());
//! build the size-only kernel parameter from the tensor layouts
NCBKernSizeParam make_ncb_kern_size_param(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter);
//! build the runtime kernel parameter from the tensors and the workspace
NCBKernParam make_ncb_kern_param(
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace);
//! algorithm categories for \p param in suggested order
SmallVector<AlgoCategory> suggest_algo_category_order(
const NCBKernSizeParam& param) const;
public:
//! accessor of the AlgoPack instance of this operator
static const AlgoPack& algo_pack();
};
class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl {
public:
using naive::ConvolutionBackwardDataImpl::ConvolutionBackwardDataImpl;
//! The declarations below override the operator interface inherited from
//! naive::ConvolutionBackwardDataImpl; their definitions are not in this
//! header.
//! Execute the backward-data convolution: \p filter and \p diff are inputs,
//! \p grad is the output.
void exec(
        _megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad,
        _megdnn_workspace workspace) override;
//! Workspace size in bytes required by exec() for the given layouts.
size_t get_workspace_in_bytes(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad) override;
//! Algorithms for the given layouts.
std::vector<Algorithm*> get_all_algorithms(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad) override;
//! "Safe" variant of get_all_algorithms().
//! NOTE(review): the difference from get_all_algorithms() is not visible in
//! this header -- confirm against the base class documentation.
std::vector<Algorithm*> get_all_algorithms_safe(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad) override;
//! Heuristically choose one algorithm under a workspace limit and the
//! required (\p positive_attr) / forbidden (\p negative_attr) attributes.
Algorithm* get_algorithm_heuristic(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) override;
const char* get_algorithm_set_name() const override;
//! Size-only description of one backward-data convolution problem (no data
//! pointers); it is what AlgoBase::usable(), get_workspace() and
//! dispatch_kern() receive.
struct NCBKernSizeParam {
//! NOTE(review): presumably the batch size -- confirm
uint32_t n;
//! input / output spatial sizes
std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
CanonizedFilterMeta filter_meta;
DType diff_type, filter_type, grad_type;
//! full layouts of the three tensors, stored by value
TensorLayout diff_layout, filter_layout, grad_layout;
//! NOTE(review): presumably batch strides of input / output -- confirm
//! the unit (elements vs. bytes) in make_ncb_kern_size_param()
ptrdiff_t inp_bs, out_bs;
//! NOTE(review): meaning of the "extra mem size" values is not visible in
//! this header -- confirm where they are filled in
size_t diff_extra_mem_size, filter_extra_mem_size, grad_extra_mem_size;
Param::ComputeMode compute_mode;
};
/*!
 * \brief Runtime kernel parameter: the size description inherited from
 *      NCBKernSizeParam plus the tensor and workspace pointers.
 *
 * diff(), filter() and grad() check \c T against the stored dtype through
 * DType::assert_is_compatible_ctype before casting; workspace() does not.
 */
struct NCBKernParam : public NCBKernSizeParam {
    RefPtr filter_ptr;
    RefPtr diff_ptr;
    RefPtr grad_ptr;
    void* workspace_ptr;
    size_t workspace_size;

    //! read-only view of the diff tensor as \c T
    template <typename T>
    const T* diff() const {
        diff_type.assert_is_compatible_ctype<T>();
        const void* raw_diff = diff_ptr.get_ptr();
        return static_cast<const T*>(raw_diff);
    }

    //! read-only view of the filter tensor as \c T
    template <typename T>
    const T* filter() const {
        filter_type.assert_is_compatible_ctype<T>();
        const void* raw_filter = filter_ptr.get_ptr();
        return static_cast<const T*>(raw_filter);
    }

    //! writable view of the grad tensor as \c T
    template <typename T>
    T* grad() const {
        grad_type.assert_is_compatible_ctype<T>();
        void* raw_grad = grad_ptr.get_ptr();
        return static_cast<T*>(raw_grad);
    }

    //! workspace pointer viewed as \c T
    template <typename T>
    T* workspace() const {
        void* raw_workspace = workspace_ptr;
        return static_cast<T*>(raw_workspace);
    }
};
protected:
//! Callable type of a backward-data kernel; it only receives the runtime
//! parameter.
using ncb_kern_t = thin_function<void(const NCBKernParam& param)>;

/*!
 * \brief Base class of the backward-data convolution algorithms of the
 *      fallback handle.
 *
 * Concrete algorithms must implement usable(), get_workspace() and
 * dispatch_kern(), each of which also receives the calling operator.
 */
class AlgoBase : public Algorithm {
protected:
    //! protected: code outside the hierarchy cannot destroy an algorithm
    //! through an AlgoBase pointer
    ~AlgoBase() = default;

public:
    //! tag the algorithm as belonging to the FALLBACK handle type
    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::FALLBACK; }

    enum class AlgoType : uint32_t {
        FB_NAIVE = 1 << 0,
        FB_DIRECT,
        FB_MATMUL,
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
        //! ARM specific entries start at a separate base value (1 << 8)
        ARM_COMMON_DIRECT_STRD1_DOT_INT8X8X32 = 1 << 8,
        ARM_COMMON_DIRECT_STRD2_DOT_INT8X8X32,
        ARM_COMMON_DIRECT_STRD1_DOT_QU8,
        ARM_COMMON_DIRECT_STRD2_DOT_QU8
#endif
    };

    //! whether this algorithm can handle \p param for operator \p opr
    virtual bool usable(
            ConvolutionBackwardDataImpl* opr,
            const NCBKernSizeParam& param) const = 0;

    //! workspace required for \p param
    virtual size_t get_workspace(
            ConvolutionBackwardDataImpl* opr,
            const NCBKernSizeParam& param) const = 0;

    //! kernel that carries out the computation described by \p param
    virtual ncb_kern_t dispatch_kern(
            ConvolutionBackwardDataImpl* opr,
            const NCBKernSizeParam& param) const = 0;

    /*!
     * \brief attribute-aware usability check
     *
     * True only if the algorithm has every attribute in \p positive_attr,
     * none of the attributes in \p negative_attr, and usable() accepts
     * \p opr / \p param. The checks run in that order and stop at the first
     * failure.
     */
    bool usable_attribute(
            ConvolutionBackwardDataImpl* opr, const NCBKernSizeParam& param,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) const {
        if (!contain_attribute_all(positive_attr)) {
            return false;
        }
        if (contain_attribute_any(negative_attr)) {
            return false;
        }
        return usable(opr, param);
    }

    //! whether this algorithm should be preferred for the given sizes;
    //! false by default
    virtual bool is_preferred(const NCBKernSizeParam&) const { return false; }

    //! whether this is the naive algorithm; false by default
    virtual bool is_naive() const { return false; }

    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
};
protected:
//! Counterparts of the public entry points that work on the NCB kernel
//! parameter structs instead of tensors / layouts.
virtual void exec_with_ncb_kern(const NCBKernParam& param);
virtual size_t get_workspace_with_ncb(const NCBKernSizeParam& param);
virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
const NCBKernSizeParam& param);
virtual Algorithm* get_algorithm_heuristic_with_ncb(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr);
//! NOTE(review): the "ncb_1g_" prefix presumably denotes the single-group
//! case -- confirm in the implementation file
virtual ncb_kern_t ncb_1g_dispatch_kern(
Algorithm* algo, const NCBKernSizeParam& param);
virtual size_t ncb_1g_get_workspace(Algorithm* algo, const NCBKernSizeParam& param);
virtual std::vector<Algorithm*> ncb_1g_get_all_algorithms(
const NCBKernSizeParam& param);
virtual Algorithm* ncb_1g_get_algorithm_heuristic(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr);
//! heuristic telling whether the matrix-mul based algorithm should be
//! preferred for \p param (defined out of line)
static bool is_matrix_mul_preferred(const NCBKernSizeParam& param);
//! list of the algorithms of this implementation; virtual so that derived
//! implementations can supply their own list
virtual SmallVector<AlgoBase*> get_all_packed_algo();
private:
//! NOTE(review): this pair looks like a one-entry cache of the most recent
//! algorithm selection, keyed by the size parameter -- confirm in the
//! implementation of get_algorithm()
NCBKernSizeParam m_prev_selected_algo_sizep;
Algorithm* m_prev_selected_algo = nullptr;
Algorithm* get_algorithm(const NCBKernSizeParam& param);
//! build the size-only kernel parameter from the tensor layouts
NCBKernSizeParam make_ncb_kern_size_param(
const TensorLayout& filter, const TensorLayout& diff,
const TensorLayout& grad);
//! build the runtime kernel parameter from the tensors and the workspace
NCBKernParam make_ncb_kern_param(
_megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace);
//! concrete algorithm classes and their container; only forward-declared
//! in this header
class AlgoNaive;
class AlgoDirect;
class AlgoMatrixMul;
class AlgoPack;
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
public:
//! accessor of the AlgoPack instance of this operator
static const AlgoPack& algo_pack();
};
} }
/*!
 * \brief Declare the local shorthands N, IC, IH, IW, OC, OH, OW, FH, FW, SH,
 *      SW, PH and PW from a kernel size parameter.
 *
 * \p _p is any expression yielding an object with the members n, isz, osz
 * and filter_meta (icpg, ocpg, spatial, stride, padding). It is wrapped in
 * parentheses so that expressions such as `*ptr` expand correctly, and it is
 * evaluated once per declared variable, so it must be free of side effects.
 *
 * All variables come from a single `auto` declaration and therefore must
 * deduce to the same type. The macro does not end with a semicolon; the
 * caller supplies it.
 */
#define UNPACK_CONV_F32_NCB_KERN_SIZES(_p)                                  \
    auto N = (_p).n, IC = (_p).filter_meta.icpg, IH = (_p).isz[0],          \
         IW = (_p).isz[1], OC = (_p).filter_meta.ocpg, OH = (_p).osz[0],    \
         OW = (_p).osz[1], FH = (_p).filter_meta.spatial[0],                \
         FW = (_p).filter_meta.spatial[1], SH = (_p).filter_meta.stride[0], \
         SW = (_p).filter_meta.stride[1], PH = (_p).filter_meta.padding[0], \
         PW = (_p).filter_meta.padding[1]