megenginelite-sys 1.8.2

/**
 * \file dnn/src/naive/rnn/funcs.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
// #ifndef _RNN_H
// #define _RNN_H
#include "megdnn/oprs.h"
namespace megdnn {
namespace naive {
namespace rnn {

/**
 * \brief Run one step of a recurrent cell implemented by \p CellOpr.
 *
 * Only the declaration lives in this header; the definition is not visible
 * here. Inside this file it is invoked by exec_internal() once per
 * (layer, direction, time step).
 *
 * \param input input slice for the current step
 * \param weight_ih weight applied to \p input (presumably input-to-hidden)
 * \param weight_hh weight applied to the hidden state (presumably
 *        hidden-to-hidden)
 * \param bias_ih bias paired with \p weight_ih
 * \param bias_hh bias paired with \p weight_hh
 * \param states states the cell starts the step from
 * \param states_new tensors receiving the states produced by the step;
 *        exec_internal() copies them back into its running states afterwards
 * \param workspace scratch memory handed to the cell
 * \param nonline_mode non-linearity selector forwarded by the caller
 * \param handle handle forwarded by the caller
 */
template <typename CellOpr>
void cell_opr_exec(
        _megdnn_tensor_in input, _megdnn_tensor_in weight_ih,
        _megdnn_tensor_in weight_hh, _megdnn_tensor_in bias_ih,
        _megdnn_tensor_in bias_hh, const TensorNDArray& states,
        TensorNDArray& states_new, _megdnn_workspace workspace,
        param::RNNCell::NonlineMode nonline_mode, Handle* handle);

/**
 * \brief Workspace size, in bytes, needed by a single step of \p CellOpr.
 *
 * Only the declaration lives in this header; the definition is not visible
 * here. get_workspace_in_bytes() uses it to size the scratch memory for both
 * first-layer and deeper-layer cells.
 *
 * \param input layout of the per-step cell input
 * \param weight_ih layout of the weight applied to the input
 * \param weight_hh layout of the weight applied to the hidden state
 * \param bias_ih layout of the bias paired with \p weight_ih
 * \param bias_hh layout of the bias paired with \p weight_hh
 * \param hx layout of the hidden state
 * \param handle handle forwarded by the caller
 * \return required workspace size in bytes
 */
template <typename CellOpr>
size_t cell_opr_get_workspace_in_bytes(
        const TensorLayout& input, const TensorLayout& weight_ih,
        const TensorLayout& weight_hh, const TensorLayout& bias_ih,
        const TensorLayout& bias_hh, const TensorLayout& hx, Handle* handle);

template <typename CellOpr>
size_t get_workspace_in_bytes(
        const TensorLayout& input, const TensorLayout& flatten_weights,
        size_t hidden_size,
        size_t D,  // num_directions
        Handle* handle) {
    size_t seq_len = input.shape[0];
    size_t batch_size = input.shape[1];
    size_t input_size = input.shape[2];
    size_t gate_hidden_size = flatten_weights.shape[0];
    // concat workspace
    TensorLayout direction_output_layout{
            TensorShape{seq_len, batch_size, hidden_size}, input.dtype};
    TensorLayout output_layout{{seq_len, batch_size, D * hidden_size}, input.dtype};
    TensorLayoutArray layer_layouts;
    for (size_t i = 0; i < D; ++i)
        layer_layouts.push_back(direction_output_layout);
    auto concat_opr = handle->create_operator<ConcatForward>();
    concat_opr->param().axis = -1;
    size_t concat_workspace =
            concat_opr->get_workspace_in_bytes(layer_layouts, output_layout);
    // cell workspace
    TensorLayout weight_ih{{gate_hidden_size, input_size}, flatten_weights.dtype};
    TensorLayout D_weight_ih{
            {gate_hidden_size, D * hidden_size}, flatten_weights.dtype};
    TensorLayout weight_hh{{gate_hidden_size, hidden_size}, flatten_weights.dtype};
    TensorLayout bias{{gate_hidden_size}, flatten_weights.dtype};
    TensorLayout hx{{batch_size, hidden_size}, input.dtype};
    TensorLayout cell_input = {{input.shape[1], input.shape[2]}, input.dtype};
    TensorLayout D_cell_input = {{input.shape[1], D * hidden_size}, input.dtype};

    size_t cell_workspace = cell_opr_get_workspace_in_bytes<CellOpr>(
            cell_input, weight_ih, weight_hh, bias, bias, hx, handle);
    size_t D_cell_workspace = cell_opr_get_workspace_in_bytes<CellOpr>(
            D_cell_input, D_weight_ih, weight_hh, bias, bias, hx, handle);

    return std::max(std::max(cell_workspace, D_cell_workspace), concat_workspace);
}

template <class Cell, typename CellOpr>
void exec_internal(
        std::vector<Cell>& cells, _megdnn_tensor_in input, const TensorNDArray& states,
        TensorNDArray& states_new, _megdnn_tensor_out output,
        _megdnn_tensor_out reserve_space, size_t num_layers, size_t D,
        param::RNNCell::NonlineMode nonline_mode, Handle* handle,
        _megdnn_workspace workspace) {
    size_t seq_len = input.layout.shape[0];
    size_t batch_size = input.layout.shape[1];
    size_t input_size = input.layout.shape[2];
    size_t hidden_size = cells[0].weight_hh.layout.shape[1];
    TensorLayout cell_output_layout{
            TensorShape{batch_size, hidden_size}, states[0].layout.dtype};
    TensorLayout cell_first_input_layout{
            TensorShape{batch_size, input_size}, input.layout.dtype};
    TensorLayout cell_input_layout{
            TensorShape{batch_size, D * hidden_size}, input.layout.dtype};
    TensorLayout direction_output_layout{
            TensorShape{seq_len, batch_size, hidden_size}, output.layout.dtype};
    TensorND tmp_output{workspace.raw_ptr, output.layout};
    _megdnn_workspace new_workspace{
            workspace.raw_ptr + tmp_output.layout.span().dist_byte(),
            workspace.size - tmp_output.layout.span().dist_byte()};

    auto cell_opr = handle->create_operator<CellOpr>();
    auto copy_opr = handle->create_operator<TypeCvtForward>();

    // copy states to states_new
    for (size_t i = 0; i < states.size(); ++i)
        copy_opr->exec(states[i], states_new[i]);
    void* reserve_ptr = reserve_space.raw_ptr();

    // layer 1
    for (size_t d = 0; d < D; ++d) {
        size_t cell_idx = d;
        auto& cell = cells[cell_idx];

        TensorNDArray cur_states;
        size_t states_offset = cell_idx * cell_output_layout.span().dist_byte();
        for (size_t i = 0; i < states.size(); ++i) {
            cur_states.push_back(TensorND{
                    static_cast<uint8_t*>(states_new[i].raw_ptr()) + states_offset,
                    cell_output_layout});
        }

        for (size_t i = 0; i < seq_len; ++i) {
            size_t step = d == 0 ? i : seq_len - 1 - i;
            TensorND step_input{
                    static_cast<uint8_t*>(input.raw_ptr()) +
                            step * cell_first_input_layout.span().dist_byte(),
                    cell_first_input_layout};
            TensorND step_output{
                    static_cast<uint8_t*>(output.raw_ptr()) +
                            (step * D) * cell_output_layout.span().dist_byte() +
                            d * cell_output_layout.span().dist_byte() / batch_size,
                    cell_output_layout};
            TensorNDArray tmp_states;
            for (size_t s = 0; s < cur_states.size(); ++s) {
                tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout});
                size_t size_in_bytes = cur_states[s].layout.span().dist_byte();
                reserve_ptr = static_cast<uint8_t*>(reserve_ptr) + size_in_bytes;
            }
            cell_opr_exec<CellOpr>(
                    step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih,
                    cell.bias_hh, cur_states, tmp_states, new_workspace, nonline_mode,
                    handle);
            for (size_t s = 0; s < tmp_states.size(); ++s) {
                copy_opr->exec(tmp_states[s], cur_states[s]);
            }
            TensorLayout half_output_layout{
                    TensorShape{hidden_size}, states[0].layout.dtype};
            if (D == 2) {
                for (size_t i = 0; i < batch_size; i++) {
                    TensorND half_cur_states{
                            // output
                            static_cast<uint8_t*>(cur_states[0].raw_ptr()) +
                                    i * half_output_layout.span().dist_byte(),
                            half_output_layout};
                    TensorND half_step_output{
                            static_cast<uint8_t*>(step_output.raw_ptr()) +
                                    i * half_output_layout.span().dist_byte() * 2,
                            half_output_layout};
                    copy_opr->exec(half_cur_states, half_step_output);
                }
            } else
                copy_opr->exec(cur_states[0], step_output);
        }
    }

    for (size_t layer = 1; layer < num_layers; ++layer) {
        for (size_t d = 0; d < D; ++d) {
            size_t cell_idx = layer * D + d;
            auto& cell = cells[cell_idx];

            TensorNDArray cur_states;
            size_t states_offset = cell_idx * cell_output_layout.span().dist_byte();
            for (size_t i = 0; i < states.size(); ++i) {
                cur_states.push_back(TensorND{
                        static_cast<uint8_t*>(states_new[i].raw_ptr()) + states_offset,
                        cell_output_layout});
            }

            for (size_t i = 0; i < seq_len; ++i) {
                size_t step = d == 0 ? i : seq_len - 1 - i;
                TensorND step_input{
                        static_cast<uint8_t*>(output.raw_ptr()) +
                                step * cell_input_layout.span().dist_byte(),
                        cell_input_layout};
                TensorND step_output{
                        static_cast<uint8_t*>(tmp_output.raw_ptr()) +
                                (step * D) * cell_output_layout.span().dist_byte() +
                                d * cell_output_layout.span().dist_byte() / batch_size,
                        cell_output_layout};
                TensorNDArray tmp_states;
                for (size_t s = 0; s < cur_states.size(); ++s) {
                    tmp_states.push_back(TensorND{reserve_ptr, cur_states[s].layout});
                    size_t size_in_bytes = cur_states[s].layout.span().dist_byte();
                    reserve_ptr = static_cast<uint8_t*>(reserve_ptr) + size_in_bytes;
                }
                cell_opr_exec<CellOpr>(
                        step_input, cell.weight_ih, cell.weight_hh, cell.bias_ih,
                        cell.bias_hh, cur_states, tmp_states, new_workspace,
                        nonline_mode, handle);
                // copy states to cur_states
                for (size_t s = 0; s < tmp_states.size(); ++s) {
                    copy_opr->exec(tmp_states[s], cur_states[s]);
                }
                TensorLayout half_output_layout{
                        TensorShape{hidden_size}, states[0].layout.dtype};
                if (D == 2) {
                    for (size_t i = 0; i < batch_size; i++) {
                        TensorND half_cur_states{
                                // output
                                static_cast<uint8_t*>(cur_states[0].raw_ptr()) +
                                        i * half_output_layout.span().dist_byte(),
                                half_output_layout};
                        TensorND half_step_output{
                                static_cast<uint8_t*>(step_output.raw_ptr()) +
                                        i * half_output_layout.span().dist_byte() * 2,
                                half_output_layout};
                        copy_opr->exec(half_cur_states, half_step_output);
                    }
                } else
                    copy_opr->exec(cur_states[0], step_output);
            }
        }
        copy_opr->exec(tmp_output, output);
    }
}

/**
 * \brief Build one cell per (layer, direction) on top of a packed weight
 *        buffer.
 *
 * Cells are appended to \p cells in layer-major order (index layer * D + d).
 * Each cell is constructed at the current position inside
 * \p flatten_weights, and the position is then advanced by that cell's
 * weight_size_in_bytes(). Every cell receives the same \p workspace.
 *
 * \param D number of directions
 * \param num_layers number of stacked layers
 * \param input_size input width of the first layer; deeper layers use
 *        D * hidden_size
 * \param hidden_size hidden width of one direction
 * \param bias forwarded to the Cell constructor
 * \param cells destination vector, appended to
 * \param flatten_weights packed weights; supplies the base pointer and dtype
 * \param workspace forwarded to the Cell constructor
 * \return workspace_size_in_bytes() of the first cell in \p cells, or 0 when
 *         \p cells is empty (D == 0 or num_layers == 0 and nothing was
 *         supplied by the caller)
 */
template <class Cell>
size_t get_cells(
        size_t D, size_t num_layers, size_t input_size, size_t hidden_size, bool bias,
        std::vector<Cell>& cells, _megdnn_tensor_in flatten_weights,
        _megdnn_workspace workspace) {
    cells.reserve(D * num_layers);
    void* weight_ptr = flatten_weights.raw_ptr();
    for (size_t layer = 0; layer < num_layers; ++layer) {
        // only the first layer sees the raw input; deeper layers consume the
        // joined output of all directions of the layer below
        const size_t cell_input_size = layer == 0 ? input_size : D * hidden_size;
        for (size_t d = 0; d < D; ++d) {
            Cell cell(
                    weight_ptr, hidden_size, cell_input_size, bias,
                    flatten_weights.layout.dtype, workspace);
            weight_ptr =
                    static_cast<uint8_t*>(weight_ptr) + cell.weight_size_in_bytes();
            cells.push_back(cell);
        }
    }
    // indexing an empty vector is undefined behaviour; no cell means no
    // workspace is consumed
    if (cells.empty())
        return 0;
    return cells[0].workspace_size_in_bytes();
}

/**
 * \brief Rebuild per-layer inputs/outputs and per-step cell states from the
 *        reserve space written by the forward pass.
 *
 * For every layer a tensor with y's layout is carved out of \p workspace and
 * filled from \p reserve_space. For every cell (layer, d) a state sequence is
 * appended to \p cell_seq_states: entry 0 is the cell's initial state taken
 * from \p unfold_hx, followed by one entry per loop step holding views into
 * \p reserve_space.
 *
 * \param x network input; becomes layer_inputs[0]
 * \param y network output; only its layout and dtype are used
 * \param unfold_hx initial states, assumed to hold one entry per cell in
 *        layer-major order (layer * D + d), like every other per-cell
 *        container in this file. Taken by value to keep the signature
 *        unchanged
 * \param reserve_space states saved by the forward pass
 * \param num_layers number of stacked layers
 * \param D number of directions
 * \param hidden_size hidden width of one direction
 * \param cells must not be empty; cells[0].num_states() gives the number of
 *        state tensors per step
 * \param layer_inputs appended with x and every layer output but the last
 * \param layer_outputs appended with one tensor per layer
 * \param cell_seq_states appended with one state sequence per cell
 * \param workspace storage for the reconstructed layer outputs
 * \return number of workspace bytes consumed
 */
template <class Cell>
size_t get_inputs_for_exec(
        _megdnn_tensor_in x, _megdnn_tensor_in y,
        const std::vector<TensorNDArray> unfold_hx, _megdnn_tensor_in reserve_space,
        size_t num_layers, size_t D, size_t hidden_size, const std::vector<Cell>& cells,
        TensorNDArray& layer_inputs, TensorNDArray& layer_outputs,
        std::vector<std::vector<TensorNDArray>>& cell_seq_states,
        param::RNNCell::NonlineMode /*nonlineMode*/, _megdnn_workspace workspace) {
    layer_inputs.push_back(x);
    size_t seq_len = x.layout.shape[0];
    size_t batch_size = x.layout.shape[1];
    size_t num_states = cells[0].num_states();
    TensorLayout cell_output_layout{{batch_size, hidden_size}, y.layout.dtype};
    const size_t cell_bytes = cell_output_layout.span().dist_byte();
    void* workspace_ptr = workspace.raw_ptr;

    // extract intermediate states from reserve space
    for (size_t layer = 0; layer < num_layers; ++layer) {
        TensorND layer_output{workspace_ptr, y.layout};
        workspace_ptr = static_cast<uint8_t*>(workspace_ptr) +
                        layer_output.layout.span().dist_byte();
        for (size_t d = 0; d < D; ++d) {
            const size_t cell_idx = layer * D + d;
            cell_seq_states.push_back(std::vector<TensorNDArray>());
            std::vector<TensorNDArray>& seq_states = cell_seq_states.back();
            // Entry 0 is the initial state of this very cell. The index must
            // be layer * D + d: the product layer * d is 0 for the whole
            // first layer and for every direction 0, which would hand the
            // state of cell 0 to unrelated cells.
            seq_states.push_back({unfold_hx[cell_idx]});
            // reverse direction is stored with reversed order of sequence order
            for (size_t i = 0; i < seq_len; ++i) {
                size_t step = i;
                if (d == 1)
                    step = seq_len - i - 1;
                // every step stores num_states tensors back to back
                size_t offset = (cell_idx * seq_len + step) * cell_bytes * num_states;
                uint8_t* step_base =
                        static_cast<uint8_t*>(reserve_space.raw_ptr()) + offset;
                TensorNDArray cur_states;
                for (size_t s = 0; s < num_states; ++s) {
                    TensorND h{step_base + s * cell_bytes, cell_output_layout};
                    cur_states.push_back(h);
                }
                // the first state of the step doubles as the cell output
                TensorND hy{step_base, cell_output_layout};
                // states
                seq_states.push_back(cur_states);
                // output
                // NOTE(review): the destination does not depend on d, so for
                // D > 1 every direction writes to the same place. That differs
                // from the per-row interleaving exec_internal() produces;
                // confirm before relying on this for bidirectional networks.
                offset = i * D * cell_bytes;
                memcpy(static_cast<uint8_t*>(layer_output.raw_ptr()) + offset,
                       hy.raw_ptr(), hy.layout.span().dist_byte());
            }
        }
        layer_outputs.push_back(layer_output);
        if (layer != num_layers - 1)
            layer_inputs.push_back(layer_output);
    }
    // return used workspace size
    return static_cast<uint8_t*>(workspace_ptr) -
           static_cast<uint8_t*>(static_cast<void*>(workspace.raw_ptr));
}

/**
 * \brief Backward pass of a stacked RNN with D directions per layer.
 *
 * Layers are visited from the last to the first. For every cell the sequence
 * is walked step by step, Cell::backward() is called, and the per-step weight
 * and bias gradients are accumulated into views of \p dw. Input gradients of
 * the first layer are summed into \p dx; input gradients of deeper layers
 * become the output gradient of the layer below.
 *
 * \param cells forward cells, addressed as cells[layer * D + d]
 * \param D number of directions
 * \param num_layers number of stacked layers
 * \param input_size input width of the first layer
 * \param bias forwarded to get_cells() when building the gradient cells
 * \param nonlineMode forwarded to Cell::backward()
 * \param layer_inputs input of every layer (see the comment in the body)
 * \param layer_outputs output of every layer (see the comment in the body)
 * \param cell_seq_states per-cell, per-step states (see the comment in the
 *        body)
 * \param dy gradient of the network output; copied into the workspace first
 * \param dhy incoming gradients of the final states; copied into \p dstates
 * \param dx gradient of the network input; zeroed, then accumulated
 * \param dstates state gradients, updated in place by Cell::backward()
 * \param dw packed weight gradients; zeroed, then accumulated
 * \param handle used to create the elementwise-add and copy operators
 * \param workspace scratch memory, consumed front to back by this function
 */
template <class Cell>
void backward_exec_internal(
        std::vector<Cell>& cells, size_t D, size_t num_layers, size_t input_size,
        bool bias, param::RNNCell::NonlineMode nonlineMode,
        const TensorNDArray& layer_inputs, const TensorNDArray& layer_outputs,
        const std::vector<std::vector<TensorNDArray>>& cell_seq_states,
        _megdnn_tensor_in dy, const TensorNDArray& dhy, _megdnn_tensor_out dx,
        TensorNDArray& dstates, _megdnn_tensor_out dw, Handle* handle,
        _megdnn_workspace workspace) {
    /*
       layer_inputs: inputs of each layer. Element 0 is
       [seq_len, batch_size, input_size], every other element is
       [seq_len, batch_size, D * hidden_size].
       layer_outputs: outputs of each layer. To access the outputs of the cell
       at (layer, d), use layer_outputs[layer]. The shape is
       [seq_len, batch_size, output_size (D * hidden_size)], in sequence order.
       cell_seq_states: states of each cell at each step. To access the states
       of the cell at (layer, d) at sequence step (step), use
       cell_seq_states[layer * D + d][step].
    */
    size_t seq_len = layer_inputs[0].layout.shape[0];
    size_t batch_size = layer_inputs[0].layout.shape[1];
    DType dtype = layer_inputs[0].layout.dtype;
    size_t cell_y_size = layer_outputs[0].layout.shape[2] / D;
    size_t hidden_size = cell_y_size;
    TensorLayout cell_y_layout = {{batch_size, cell_y_size}, dtype};
    void* workspace_ptr = workspace.raw_ptr;

    // workspace region 1: gradient w.r.t. the output of the layer currently
    // being processed, seeded with dy
    TensorND layer_output_grad{
            workspace_ptr, {{seq_len, batch_size, D * hidden_size}, dtype}};
    workspace_ptr = static_cast<uint8_t*>(workspace_ptr) +
                    layer_output_grad.layout.span().dist_byte();
    memcpy(layer_output_grad.raw_ptr(), dy.raw_ptr(), dy.layout.span().dist_byte());
    // workspace region 2: per-direction input gradients of the deeper layers
    // NOTE(review): each buffer is sized for hidden_size columns, while dx_t
    // below strides through it with layer_inputs[layer].shape[2] columns
    // (documented above as D * hidden_size) - confirm this is large enough
    // when D > 1.
    TensorNDArray direction_dx_arr;
    for (size_t i = 0; i < D; ++i) {
        TensorLayout direction_dx_layout{{seq_len, batch_size, hidden_size}, dtype};
        direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout));
        workspace_ptr = static_cast<uint8_t*>(workspace_ptr) +
                        direction_dx_layout.span().dist_byte();
    }
    // workspace region 3: per-direction input gradients of the first layer
    TensorNDArray L0_direction_dx_arr;
    for (size_t i = 0; i < D; ++i) {
        TensorLayout direction_dx_layout{{seq_len, batch_size, input_size}, dtype};
        L0_direction_dx_arr.push_back(TensorND(workspace_ptr, direction_dx_layout));
        workspace_ptr = static_cast<uint8_t*>(workspace_ptr) +
                        direction_dx_layout.span().dist_byte();
    }

    // per-cell views into dstates, initialised with the matching slice of dhy
    std::vector<TensorNDArray> dstates_arr;
    for (size_t layer = 0; layer < num_layers; ++layer) {
        for (size_t d = 0; d < D; ++d) {
            TensorNDArray cell_states;
            cell_states.reserve(dstates.size());
            for (size_t i = 0; i < dstates.size(); ++i) {
                size_t offset = (layer * D + d) * cell_y_layout.span().dist_byte();
                TensorND dhx_cell{
                        static_cast<uint8_t*>(dstates[i].raw_ptr()) + offset,
                        cell_y_layout};
                memcpy(dhx_cell.raw_ptr(),
                       static_cast<uint8_t*>(dhy[i].raw_ptr()) + offset,
                       cell_y_layout.span().dist_byte());
                cell_states.emplace_back(dhx_cell);
            }
            dstates_arr.push_back(cell_states);
        }
    }

    // weight gradients are accumulated step by step, so start from zero
    memset(dw.raw_ptr(), 0, dw.layout.span().dist_byte());

    // gradient cells: same structure as `cells`, but laid over dw; the value
    // returned by get_cells() advances the workspace cursor
    std::vector<Cell> cell_grads;
    size_t used_workspace_size = static_cast<uint8_t*>(workspace_ptr) -
                                 static_cast<uint8_t*>((void*)(workspace.raw_ptr));
    workspace_ptr =
            static_cast<uint8_t*>(workspace_ptr) +
            get_cells(
                    D, num_layers, input_size, hidden_size, bias, cell_grads, dw,
                    Workspace(
                            workspace.raw_ptr + used_workspace_size,
                            workspace.size - used_workspace_size));

    auto add_opr = handle->create_operator<ElemwiseForward>();
    add_opr->param().mode = Elemwise::Mode::ADD;
    auto copy_opr = handle->create_operator<TypeCvtForward>();

    // initialize dx to zero
    memset(dx.raw_ptr(), 0, dx.layout.span().dist_byte());

    // calculate grads
    for (int layer = (int)num_layers - 1; layer >= 0; --layer) {
        for (int d = (int)D - 1; d >= 0; --d) {
            Cell& cell = cells[layer * D + d];
            Cell& cell_grad = cell_grads[layer * D + d];
            // NOTE: shadows the function parameter `input_size` with the
            // input width of the current layer
            size_t input_size = layer_inputs[layer].layout.shape[2];
            const TensorND& x_arr = layer_inputs[layer];
            const TensorND& y_arr = layer_outputs[layer];
            TensorLayout x_layout = {{batch_size, input_size}, dtype};

            // tmp tensors
            // per-step gradients are written here before being added to the
            // accumulated gradients in cell_grad
            void* tmp_workspace_ptr = workspace_ptr;
            TensorND dwi_tmp{tmp_workspace_ptr, cell_grad.weight_ih.layout};
            tmp_workspace_ptr = static_cast<uint8_t*>(tmp_workspace_ptr) +
                                dwi_tmp.layout.span().dist_byte();
            TensorND dwh_tmp{tmp_workspace_ptr, cell_grad.weight_hh.layout};
            tmp_workspace_ptr = static_cast<uint8_t*>(tmp_workspace_ptr) +
                                dwh_tmp.layout.span().dist_byte();
            TensorND dbias_tmp{tmp_workspace_ptr, cell_grad.bias_ih.layout};
            tmp_workspace_ptr = static_cast<uint8_t*>(tmp_workspace_ptr) +
                                dbias_tmp.layout.span().dist_byte();
            // NOTE: shadows the outer `used_workspace_size`; this one also
            // covers the three temporaries above
            size_t used_workspace_size =
                    static_cast<uint8_t*>(tmp_workspace_ptr) -
                    static_cast<uint8_t*>((void*)(workspace.raw_ptr));

            for (size_t i = 0; i < seq_len; ++i) {
                // direction 0 is walked from the last step to the first, any
                // other direction from the first step to the last
                size_t step = i;
                if (d == 0)
                    step = seq_len - i - 1;
                TensorND x{
                        static_cast<uint8_t*>(x_arr.raw_ptr()) +
                                step * x_layout.span().dist_byte(),
                        x_layout},
                        y{static_cast<uint8_t*>(y_arr.raw_ptr()) +
                                  (step * D + d) * cell_y_layout.span().dist_byte(),
                          cell_y_layout};
                const TensorNDArray& cell_states = cell_seq_states[layer * D + d][step];
                TensorNDArray& dstates_new = dstates_arr[layer * D + d];

                // the gradient reaching this step's output is the layer output
                // gradient plus the gradient carried by the first state
                TensorND dy_t{
                        static_cast<uint8_t*>(layer_output_grad.raw_ptr()) +
                                (step * D + d) * cell_y_layout.span().dist_byte(),
                        cell_y_layout};
                add_opr->exec({dstates_new[0], dy_t}, dy_t);

                // destination of this step's input gradient
                TensorND dx_t;
                if (layer == 0)
                    dx_t = {static_cast<uint8_t*>(L0_direction_dx_arr[d].raw_ptr()) +
                                    step * x_layout.span().dist_byte(),
                            x_layout};
                else
                    dx_t = {static_cast<uint8_t*>(direction_dx_arr[d].raw_ptr()) +
                                    step * x_layout.span().dist_byte(),
                            x_layout};
                // output gradients handed to the cell: dy_t replaces the first
                // state gradient, the remaining state gradients are kept
                TensorNDArray douts = {dy_t};
                for (size_t s = 1; s < dstates_new.size(); ++s)
                    douts.push_back(dstates_new[s]);
                cell.backward(
                        handle, nonlineMode, x, cell_states, y, douts, dx_t,
                        dstates_new, dwi_tmp, dwh_tmp, dbias_tmp,
                        Workspace(
                                workspace.raw_ptr + used_workspace_size,
                                workspace.size - used_workspace_size));
                // add step gradient to overall gradient
                // (the same bias gradient is added to both bias_ih and bias_hh)
                add_opr->exec({dwi_tmp, cell_grad.weight_ih}, cell_grad.weight_ih);
                add_opr->exec({dwh_tmp, cell_grad.weight_hh}, cell_grad.weight_hh);
                add_opr->exec({dbias_tmp, cell_grad.bias_ih}, cell_grad.bias_ih);
                add_opr->exec({dbias_tmp, cell_grad.bias_hh}, cell_grad.bias_hh);
            }
        }
        // add gradient of different directions to layer_output_grad.
        if (layer == 0) {
            // first layer: the input gradients of all directions sum into dx
            for (size_t i = 0; i < D; ++i)
                add_opr->exec({L0_direction_dx_arr[i], dx}, dx);
        } else {
            // deeper layer: the input gradient becomes the output gradient of
            // the layer below
            if (D == 1)
                copy_opr->exec(direction_dx_arr[0], layer_output_grad);
            else {
                for (size_t t = 0; t < seq_len; ++t) {
                    size_t offset = t * D * cell_y_layout.span().dist_byte();
                    for (size_t d = 0; d < D; ++d) {
                        TensorND src{
                                static_cast<uint8_t*>(direction_dx_arr[d].raw_ptr()) +
                                        offset,
                                cell_y_layout};
                        TensorND dst{
                                static_cast<uint8_t*>(layer_output_grad.raw_ptr()) +
                                        offset + d * cell_y_layout.span().dist_byte(),
                                cell_y_layout};
                        copy_opr->exec(src, dst);
                    }
                }
            }
        }
    }
}

}  // namespace rnn
}  // namespace naive
}  // namespace megdnn
// #include "funcs.tpp"
// #endif