sipp-sys 0.1.1 - Docs.rs

#include "ggml-decoder.h"

#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino.h"
#include "ggml-quants.h"
#include "ggml.h"
#include "utils.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <map>
#include <memory>
#include <openvino/core/dimension.hpp>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/partial_shape.hpp>
#include <openvino/core/type/bfloat16.hpp>
#include <openvino/core/type/element_type.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/runtime/tensor.hpp>
#include <ostream>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                             ModelParams & model_params,
                             ComputeParams & compute_params,
                             std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                             bool is_static,
                             bool is_stateful,
                             bool model_is_splitted,
                             bool is_prefill,
                             int prefill_chunk_size) :
    m_is_static(is_static),
    m_is_stateful(is_stateful),
    m_is_prefill(is_prefill),
    m_naive(false),
    m_prefill_chunk_size(prefill_chunk_size),
    m_model_is_splitted(model_is_splitted),
    m_cgraph(cgraph),
    m_model_weights(model_weights),
    m_model_params(model_params),
    m_compute_params(compute_params) {
    static bool printed_address_map = false;
    if (!printed_address_map) {
        if (ggml_openvino_getenv_int("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
            printed_address_map = true;
            print_tensor_address_map(cgraph);
        }
    }

    validate_cgraph();

    set_input_output();
    compute_node_dynamic_dims();
    compute_model_inputs();
    compute_model_outputs();

    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
        m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
    }

    add_extra_inputs();
}

void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
    m_cgraph = cgraph;
    m_model_inputs.clear();
    m_model_outputs.clear();
    m_node_info_list.clear();
    set_input_output();
    compute_model_inputs();
    compute_model_outputs();
}

GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
    m_cgraph = cgraph;
    m_model_weights = model_weights;
    m_naive = true;
    set_input_output();
    compute_model_inputs();
    compute_model_outputs();
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
        m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
    }
}

void GgmlOvDecoder::set_input_output() {
    for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
        auto node = m_cgraph->nodes[node_n];

        NodeInfo current_node_info;
        auto node_name = std::string(node->name);
        auto node_output_name = node_name;
        auto * node_output = node;
        if (node->op == GGML_OP_SET_ROWS) {
            // SET_ROWS updates the tensor in place. For later ov op that uses the
            // the view_src of SET_ROWS, we need to make sure they get the updated tensor
            // by putting the view_src name in the tensor_map in
            // <openvino>/src/frontends/ggml/src/translate_session.cpp
            node_output_name = std::string(node->view_src->name);
            node_output = node->view_src;
        }

        current_node_info.node = node;
        current_node_info.node_name = node_name;
        current_node_info.node_output = node_output;
        current_node_info.node_output_name = node_output_name;
        current_node_info.node_op_case = 0;
        current_node_info.data_addr = node->data;

        for (int i = 0; i < GGML_MAX_SRC; i++) {
            auto * src = node->src[i];
            if (src == nullptr) {
                continue;
            }
            auto src_name = std::string(src->name);
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                src_name = get_graph_input_ov_name(src, node);
            }
            current_node_info.node_inputs[src_name] = src;
            current_node_info.node_inputs_names.push_back(src_name);

            if (src->op == GGML_OP_VIEW) {
                // Traverse upward through nested VIEW operations
                std::remove_reference_t<decltype(current_node_info.node_inputs_views[src_name])> view_chain;
                auto current = src;

                while (current != nullptr) {
                    auto current_name = std::string(current->name);
                    if (current->flags & GGML_TENSOR_FLAG_INPUT) {
                        current_name = get_graph_input_ov_name(current, node);
                    }
                    view_chain.emplace_back(current_name, current);
                    // If current src is also a VIEW, continue traversing
                    if (current->src[0] != nullptr && current->src[0]->op == GGML_OP_VIEW) {
                        current = current->src[0];
                    } else {
                        break;
                    }
                }

                // Assign all collected view inputs to node_inputs_views
                current_node_info.node_inputs_views[src_name] = view_chain;
            }
        }

        m_node_info_list.push_back(current_node_info);
    }
}

int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
    int op_case = 0;
    switch (node->op) {
    case GGML_OP_RESHAPE: {
        auto * src = node->src[0];
        if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) {
            op_case = 4;
        } else if (node->ne[0] * node->ne[1] == src->ne[0]) {
            op_case = 1;
        } else if (src->ne[0] * src->ne[1] == node->ne[0]) {
            op_case = 2;
            if (src->ne[2] * src->ne[3] == node->ne[1]) {
                op_case = 5;
            }
        } else if (src->ne[0] * src->ne[1] * src->ne[2] == node->ne[1]) {
            op_case = 3;
        } else if (src->ne[1] * src->ne[2] == node->ne[1]) {
            op_case = 6;
        }
        if (op_case == 0 && ggml_nelements(node) == ggml_nelements(src)) {
            op_case = 6;
        }
        break;
    }
    case GGML_OP_PERMUTE: {
        if (node->src[0]->op != GGML_OP_VIEW) {
            op_case = 1;
        } else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
            // kv cache tensor
            std::string src_name(node->view_src->name);
            int layer = extract_layer_from_name(src_name).value();
            if (ggml_is_contiguous(node->src[0])) {
                // -  19: [    64,     8,   256,     1] VIEW            cache_k_l0 (view)             [ 2,   128,  1024, 1048576]
                //         [   512,  1024,     1,     1]      0: NONE     cache_k_l0                    [ 2,  1024, 1048576, 1048576]
                // -  20: [    64,   256,     8,     1] PERMUTE         cache_k_l0 (view) (permuted)  [ 2,  1024,   128, 1048576]
                //         [    64,     8,   256,     1]      0: VIEW     cache_k_l0 (view)             [ 2,   128,  1024, 1048576]
                if (!is_swa_layer(layer)) {
                    op_case = 3;
                } else {
                    op_case = 4;
                }
            } else {
                // special case of cache v when `-fa off`
                // -  17: [   256,     8,    64,     1] VIEW            cache_v_l0 (view)             [ 2, 131072,  2048, 1048576]
                //         [   512,  1024,     1,     1]      0: NONE     cache_v_l0                   [ 2,  1024, 1048576, 1048576]
                // -  18: [   256,    64,     8,     1] PERMUTE         cache_v_l0 (view) (permuted)  [ 2,  2048, 131072, 1048576]
                //         [   256,     8,    64,     1]      0: VIEW     cache_v_l0 (view)            [ 2, 131072,  2048, 1048576]
                if (!is_swa_layer(layer)) {
                    op_case = 5;
                } else {
                    op_case = 6;
                }
            }
        } else {
            // rope'ed query tensor
            op_case = 2;
        }
        break;
    }
    case GGML_OP_MUL_MAT: {
        if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
            op_case = 3;
        } else if (node->src[1]->op == GGML_OP_SOFT_MAX) {
            // In the case of `-fa off`, softmax is used, v_trans=true, the dynamic dim is ne[0] for cache_v
            op_case = 2;
        }
        break;
    }
    case GGML_OP_GET_ROWS: {
        if (node->src[1]->op == GGML_OP_VIEW) {
            op_case = 2;
        }
        break;
    }
    case GGML_OP_ROPE: {
        const int mode = node->op_params[2];
        switch (mode) {
        case GGML_ROPE_TYPE_NEOX: {
            op_case = 1;
            break;
        }
        case GGML_ROPE_TYPE_IMROPE: {
            op_case = 2;
            break;
        }
        default:
            op_case = 0;
            break;
        }
        break;
    }
    case GGML_OP_VIEW: {
        if (node->src[0]->op == GGML_OP_VIEW) {
            auto * src = node->src[0];
            if (ggml_nelements(node) != ggml_nelements(src)) {
                // throw std::runtime_error("Unsupported VIEW case");
            }
            op_case = 0;
            if (m_model_is_splitted && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
                op_case = 0;
            }
        }
        {
            auto * src = node->src[0];
            if (ggml_nelements(node) != ggml_nelements(src)) {
                // Case 4: select one slice on src dim1 (via view offset), keep src dim2 as output dim1.
                // Typical pattern:
                //   src: ne=[N, M, K, 1], nb=[b0, b1, b2, b3]
                //   dst: ne=[N, K, 1, 1], nb=[b0, b2, b3, b3]
                if (node->ne[0] == src->ne[0] && node->ne[1] == src->ne[2] && node->ne[2] == 1 &&
                    node->nb[0] == src->nb[0] && node->nb[1] == src->nb[2] && src->ne[1] > 1) {
                    op_case = 0;
                    break;
                }

                // General case 3: shape differs from source (one or more dims) and is handled as VIEW slicing.
                int diff_count = 0;
                for (int i = 0; i < GGML_MAX_DIMS; i++) {
                    if (node->ne[i] != src->ne[i]) {
                        diff_count++;
                    }
                    // if node ne[i] > src ne[i], case = 0
                    if (node->ne[i] > src->ne[i]) {
                        return 0;
                    }
                }
                if (diff_count >= 1) {
                    op_case = 0;
                }
            }
        }
        break;
    }
    default:
        break;
    }
    return op_case;
}

std::optional<int> extract_layer_from_name(const std::string & name) {
    size_t pos1 = name.find("_l");
    if (pos1 == std::string::npos) {
        return std::nullopt;
    }
    pos1 += 2;
    size_t pos2 = name.find(' ', pos1);
    if (pos2 == std::string::npos) {
        pos2 = name.length();
    }
    std::string layer_str = name.substr(pos1, pos2 - pos1);
    int layer = std::stoi(layer_str);
    return layer;
}

std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
    ModelParams model_params;
    ComputeParams compute_params;
    auto get_attention_pattern_case = [](const ggml_tensor * node) -> int {
        if (node == nullptr) {
            return -1;
        }

        switch (node->op) {
        case GGML_OP_FLASH_ATTN_EXT:
            if (node->src[0] == nullptr || node->src[1] == nullptr || node->src[3] == nullptr) {
                return -1;
            }
            switch (node->src[1]->op) {
            case GGML_OP_PERMUTE:
                // case 0: node op is FLASH_ATTN_EXT, src 1 not null & op is PERMUTE & the permuted tensor src is the view of cache k
                if (node->src[1]->src[0] != nullptr && node->src[1]->src[0]->op == GGML_OP_VIEW) {
                    return 0;
                }
                break;
            case GGML_OP_CPY:
                // case 1: node op is FLASH_ATTN_EXT, src 1 not null & op is CPY & the copied tensor src is PERMUTE & the permuted tensor src is the view of cache k
                if (node->src[1]->src[0] != nullptr && node->src[1]->src[0]->op == GGML_OP_PERMUTE &&
                    node->src[1]->src[0]->src[0] != nullptr && node->src[1]->src[0]->src[0]->op == GGML_OP_VIEW) {
                    return 1;
                }
                break;
            default:
                break;
            }
            break;
        case GGML_OP_SOFT_MAX:
            // case 2: node op is SOFT_MAX, src 0 not null & op is MUL_MAT & the src 0 of MUL_MAT is PERMUTE & the permuted tensor src is the view of cache k
            if (node->src[0] != nullptr && node->src[1] != nullptr && node->src[0]->op == GGML_OP_MUL_MAT &&
                node->src[0]->src[0] != nullptr && node->src[0]->src[1] != nullptr &&
                node->src[0]->src[0]->op == GGML_OP_PERMUTE && node->src[0]->src[0]->src[0] != nullptr &&
                node->src[0]->src[0]->src[0]->op == GGML_OP_VIEW) {
                return 2;
            }
            // case 3: node op is SOFT_MAX, src 0 not null & op is ADD & the src 0 of ADD is MUL_MAT & the src 0 of MUL_MAT is PERMUTE
            if (node->src[0]->op == GGML_OP_ADD && node->src[0]->src[0] != nullptr &&
                node->src[0]->src[0]->op == GGML_OP_MUL_MAT && node->src[0]->src[0]->src[0] != nullptr &&
                node->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) {
                return 3;
            }
            break;
        default:
            break;
        }

        return -1;
    };

    bool rope_seen = false;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        auto * node = cgraph->nodes[i];
        std::string name = std::string(node->name);
        const int attention_pattern_case = get_attention_pattern_case(node);
        if (attention_pattern_case != -1) {
            ggml_tensor * cache_k_permute = nullptr;
            ggml_tensor * mask = nullptr;

            switch (attention_pattern_case) {
            case 0:
                cache_k_permute = node->src[1];
                mask = node->src[3];
                break;
            case 1:
                cache_k_permute = node->src[1]->src[0];
                mask = node->src[3];
                break;
            case 2:
                cache_k_permute = node->src[0]->src[0];
                mask = node->src[1];
                break;
            case 3:
                cache_k_permute = node->src[0]->src[0]->src[0];
                mask = node->src[1];
                break;
            default:
                break;
            }

            assert(cache_k_permute != nullptr);

            model_params.head_size = cache_k_permute->ne[0];
            model_params.n_heads_kv = cache_k_permute->ne[2];
            compute_params.input_len = node->src[0]->ne[1];
            compute_params.token_len_per_seq = node->src[0]->ne[1];

            auto * cache_k_view = cache_k_permute->src[0];
            if (cache_k_view->op != GGML_OP_VIEW || mask == nullptr) {
                continue;
            }

            ggml_tensor * cache_k = cache_k_view->src[0];
            int layer = extract_layer_from_name(cache_k->name).value();

            std::string mask_name(mask->name);

            model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
            if (mask_name.find("swa") != std::string::npos) {
                model_params.swa_layers.push_back(layer);
                model_params.ctx_per_seq_swa = cache_k->ne[1];
            } else {
                model_params.ctx_per_seq = cache_k->ne[1];
                model_params.n_seq = cache_k->ne[2];
            }

            compute_params.n_seq_active = mask->ne[3];
            auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
            size_t offset;
            memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
            compute_params.seq_active_start = offset / seq_size;

            if (mask_name.find("swa") != std::string::npos) {
                compute_params.attention_size_swa = mask->ne[0];
            } else {
                compute_params.attention_size = mask->ne[0];
            }
            if (is_static) {
                compute_params.attention_size = model_params.ctx_per_seq;
                compute_params.attention_size_swa = model_params.ctx_per_seq_swa;
                compute_params.token_len_per_seq = 1;
            }
        }

        if (node->op == GGML_OP_MUL_MAT && node->src[0]->op == GGML_OP_PERMUTE &&
            node->src[0]->src[0]->op == GGML_OP_VIEW && is_kvcache(node->src[0]->view_src, node->view_src)) {
            if (node->src[1]->op == GGML_OP_PERMUTE && node->src[1]->src[0]->op == GGML_OP_VIEW &&
                node->src[1]->src[0]->src[0]->op == GGML_OP_ROPE) {
                compute_params.attention_size = node->ne[0];
            }
        }

        // if the node op is TRANSPOSE and its input is PERMUTE and the source of the PERMUTE is VIEW, then get the attention size with the TRANSPOSE node ne[0] (in case no GGML_OP_FLASH_ATTN_EXT)
        if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE &&
            node->src[0]->src[0]->op == GGML_OP_VIEW) {
            compute_params.attention_size = node->ne[0];
            if (is_static) {
                compute_params.attention_size = model_params.ctx_per_seq;
            }
        }
        if (node->op == GGML_OP_ROPE) {
            if (compute_params.token_len_per_seq == -1 && node->src[1] != nullptr) {
                compute_params.token_len_per_seq = ggml_nelements(node->src[1]);
            }

            // When multiple ROPE ops in the graph disagree on op_params (e.g. gemma4's
            // mixed SWA/non-SWA layers with different n_dims or freq_base), we cannot
            // share a single precomputed rope_sin/rope_cos. Track divergence so the
            // translator falls back to per-op make_sin_cos in that case.
            static_assert(sizeof(model_params.rope_params) == sizeof(int32_t) * 15, "rope_params size");
            if (!rope_seen) {
                memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
                rope_seen = true;
            } else if (memcmp(model_params.rope_params, node->op_params, sizeof(int32_t) * 15) != 0) {
                model_params.mixed_rope_params = true;
            }
        }
    }
    auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
    compute_params.output_len = output_tensor->ne[1];
    // for NPU, output_len is always 1 except for llama-perplexity
    if (is_static && compute_params.output_len == 0) {
        compute_params.output_len = 1;
    }
    model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
    return {model_params, compute_params};
}

void GgmlOvDecoder::validate_cgraph() const {
    if (m_model_params.n_seq > 1 && m_is_static == true) {
        throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
    }
}

ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op,
                                                      const ggml_tensor * input,
                                                      int dynamic_dim_index) const {
    if (m_naive) {
        return input != nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
    }
    auto name = std::string(input->name);
    ov::PartialShape input_shape;

    if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
        // tokens or positions
        int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
        input_shape = ov::PartialShape{1, 1, 1, len};

    } else if (is_output_idx(input, op)) {
        // output index
        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};

    } else if (is_inp_mask(input, op)) {
        // mask
        if (m_is_static) {
            input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
        } else if (m_is_stateful) {
            input_shape = ov::PartialShape{1, 1, -1, -1};
        } else {
            input_shape = ov::PartialShape{-1, 1, -1, -1};
        }

    } else if (is_kvcache(input, op)) {
        // kvcache
        input_shape = ov::PartialShape{get_shape(input)};
        if (!m_is_static) {
            // do not fix ctx size to make llama-bench work across test params
            input_shape[2] = -1;
        }
        if (is_stateful()) {
            // Convert stateless KV cache layout [1, 1, seq, n_heads_kv * head_size]
            // to stateful layout [1, seq, n_heads_kv, head_size].
            assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 &&
                   input_shape[2].is_dynamic() &&
                   input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
            input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv,
                           m_model_params.head_size};
        }

    } else if (is_kv_idx(input, op)) {
        // kv update index
        int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
        input_shape = ov::PartialShape{1, 1, 1, len};

    } else {
        input_shape = ov::PartialShape{get_shape(input)};
    }
    if (dynamic_dim_index != -1 && m_model_is_splitted) {
        input_shape[3 - dynamic_dim_index] = -1;
    }
    if (op->op == GGML_OP_SOFT_MAX && op->src[1] != nullptr && op->src[1]->op == GGML_OP_NONE &&
        op->src[1]->flags & GGML_TENSOR_FLAG_INPUT && op->src[1] == input) {
        // for softmax input mask, the shape is [1, 1, seq_active, seq_active], where seq_active is determined by the input active sequence length instead of the kv cache sequence length
        input_shape[2] = -1;
        input_shape[3] = -1;
    }
    return input_shape;
}

void GgmlOvDecoder::add_extra_inputs() {
    // Extra inputs:
    // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
    //     see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
    // 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch

    auto create_1d_input = [this](const std::string & name, int64_t value) {
        if (m_is_static) {
            auto constant =
                std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
            constant->set_friendly_name(name);
            m_model_extra_inputs[name] = constant;
        } else {
            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
            param_node->set_friendly_name(name);
            param_node->output(0).get_tensor().set_names({name});
            m_model_extra_inputs[name] = param_node;

            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
            *tensor->data<int64_t>() = value;
            m_model_extra_input_values[name] = tensor;
        }
    };

    if (m_compute_params.attention_size != -1) {
        create_1d_input("attention_size", m_compute_params.attention_size);
    }
    if (m_compute_params.attention_size_swa != -1) {
        create_1d_input("attention_size_swa", m_compute_params.attention_size_swa);
    }
    create_1d_input("n_seq_active", m_compute_params.n_seq_active);
    create_1d_input("seq_active_start", m_compute_params.seq_active_start);
    create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active);
    if (m_compute_params.token_len_per_seq != -1) {
        create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq);
    }
    // create_1d_input("token_len", m_compute_params.token_len_per_seq * m_compute_params.n_seq_active);
}

bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) {
    ggml_tensor * node = m_cgraph->nodes[node_idx];
    for (int i = node_idx; i < m_cgraph->n_nodes; i++) {
        ggml_tensor * other_node = m_cgraph->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (other_node->src[j] == node) {
                return true;
            }
        }
    }
    return false;
}

void GgmlOvDecoder::compute_model_inputs() {
    m_model_inputs.clear();
    m_inputs.clear();
    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        ggml_tensor * node = m_cgraph->nodes[i];
        // the node op is NONE means this node maybe as input of later nodes, we should add it to model inputs for this node.
        if (node->op == GGML_OP_NONE && node_is_used_as_src(i)) {
            std::string node_name(node->name);
            if (m_model_weights.find(node_name) == m_model_weights.end()) {
                m_inputs[node_name] = node;
                auto param_node = std::make_shared<ov::op::v0::Parameter>(
                    get_ov_type(node), get_graph_input_shape(node, nullptr, m_node_dynamic_dims[node]));
                param_node->set_friendly_name(node_name);
                param_node->output(0).get_tensor().set_names({node_name});
                m_model_inputs[node_name] = param_node;
            }
            continue;
        }
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            auto * src = node->src[i];
            if (src == nullptr) {
                continue;
            }
            std::string src_name = std::string(src->name);
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                src_name = get_graph_input_ov_name(src, node);
            }
            if (m_model_weights.find(src_name) != m_model_weights.end()) {
                continue;
            }

            bool is_intermediate_node = false;
            for (const auto & node_info : m_node_info_list) {
                if (node_info.node == src) {
                    is_intermediate_node = true;
                    break;
                }
            }
            if (is_intermediate_node) {
                continue;
            }
            if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
                continue;
            }

            m_inputs[src_name] = src;

            ggml_backend_buffer * buffer = src->buffer;
            // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
            if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
                if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
                    it == m_model_params.kv_names.end()) {
                    m_model_params.kv_names.push_back(src_name);
                }
            }
            // Resolve nested VIEW nodes by following src[0] until the first non-VIEW tensor.
            while (src->op == GGML_OP_VIEW && src->src[0] != nullptr) {
                src = src->src[0];
                src_name = std::string(src->name);
            }
            m_inputs[src_name] = src;
            ov::PartialShape param_shape = get_graph_input_shape(node, src, m_node_dynamic_dims[src]);
            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
            param_node->set_friendly_name(src_name);
            param_node->output(0).get_tensor().set_names({src_name});
            m_model_inputs[src_name] = param_node;
        }
    }
}

void GgmlOvDecoder::compute_model_outputs() {
    m_model_outputs.clear();
    m_model_output_names.clear();
    for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
        auto * cur_node = m_cgraph->nodes[node_n];
        // if the node op is NONE means this node is not used at all, we can skip it directly without adding to model outputs.
        if (cur_node->op == GGML_OP_NONE || cur_node->op == GGML_OP_VIEW || cur_node->op == GGML_OP_RESHAPE) {
            continue;
        }
        auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)];
        if (cur_node_use_count == 0) {
            // The output of SET_ROWS is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src.
            if (cur_node != nullptr && cur_node->op == GGML_OP_SET_ROWS) {
                cur_node = cur_node->view_src;
            }
        } else {
            int input_use_count = 0;
            for (int i = 0; i < m_cgraph->n_nodes; i++) {
                ggml_tensor * node = m_cgraph->nodes[i];
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    if (node->src[j] != NULL && node->src[j] == cur_node) {
                        input_use_count++;
                    }
                }
            }
            if (input_use_count == cur_node_use_count) {
                cur_node = nullptr;
            }
        }
        if (cur_node != nullptr) {
            std::string node_output_name(cur_node->name);
            m_model_outputs[node_output_name] = cur_node;
            m_model_output_names.push_back(node_output_name);
        }
    }
}

const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
    if (tensor == nullptr) {
        return nullptr;
    }
    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        const auto * node = m_cgraph->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (node->src[j] == tensor) {
                return node;
            }
        }
    }
    return nullptr;
}

const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        const auto * node = m_cgraph->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            const auto * src = node->src[j];
            if (src == nullptr) {
                break;
            }
            if (std::string(src->name) == name) {
                return src;
            }
        }
    }
    return nullptr;
}

std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
    std::map<std::string, std::string> kv_param_res_names;
    for (const auto & name : m_model_params.kv_names) {
        kv_param_res_names[name] = name;
    }
    return kv_param_res_names;
}

std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
    auto * nodes = cgraph->nodes;
    auto n_nodes = cgraph->n_nodes;
    for (int node_i = 0; node_i < n_nodes; node_i++) {
        auto * node = nodes[node_i];
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            auto * src = node->src[i];
            if (src == nullptr) {
                continue;
            }

            std::string src_name(src->name);
            if (is_rope_freqs_weight(src, node)) {
                src_name = "rope_freqs.weight";
            }
            if (!src->view_src) {
                ggml_backend_buffer * buffer = src->buffer;
                if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
                    if (model_weights.find(src_name) == model_weights.end()) {
                        auto weight_node = create_weight_node(src, naive);
                        weight_node->set_friendly_name(src_name);
                        model_weights[src_name] = weight_node;
                    }
                }
            }
        }
    }
    return model_weights;
}

std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
    const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);

    // Check if we have a pre-built constant from the OpenVINO backend buffer
    // This is set during ggml_backend_openvino_buffer_set_tensor
    if (tensor->extra) {
        OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
                                          " Possibly this is a cpu backend repacked quantized weights");
        // Cast to our extra base type and check the type
        auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);

        if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
            // F16/F32/BF16 weight with shared-memory constant
            auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
            if (weight_extra->weight_node) {
                // GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
                return weight_extra->weight_node;
            }
        } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
            // Quantized weight with pre-extracted data
            auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
            if (quant_extra->weight_node) {
                // GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
                return quant_extra->weight_node;
            }
        }
    }

    // MUL_MAT_ID expert weights are 3D GGML tensors [k, m, n_expert].
    // Keep the full reversed 4D shape when materializing non-quantized constants,
    // otherwise the expert dimension is collapsed and later Gather/MatMul logic
    // only sees a single expert slice.
    if (!ggml_is_quantized(tensor->type) && (tensor->ne[2] > 1 || tensor->ne[3] > 1)) {
        auto weight_tensor = ov::Tensor(get_ov_type(tensor), get_shape(tensor), tensor->data);
        auto weight_node = std::make_shared<ov::op::v0::Constant>(weight_tensor);
        weight_node->set_friendly_name(tensor->name);
        return weight_node;
    }

    // There are three cases where we need to create a new weight node:
    // 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor
    // 2. weights are in cpu/cpu_mapped buffer. On token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used
    // 3. test-backend-ops. buffers in test-backend-ops does not set USAGE_WEIGHT so backend_buffer_set_tensor will not create weight node

    // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16, GGML_TYPE_Q8_0,
                                                     GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1, GGML_TYPE_Q4_K,
                                                     GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
    if (weight_types.find(tensor->type) == weight_types.end()) {
        throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                 ggml_type_name(tensor->type));
    }

    OvWeight ov_weight;
    if (ggml_is_quantized(tensor->type)) {
        auto use_bias = naive;
        if (is_ov_buffer) {
            // For quantized weights, copy raw data to a temp buffer first because
            // process_weight_tensor reads from data and writes extracted results
            // (weights/scales/zp) to output_base_ptr — they would overlap if both
            // point to tensor->data.
            size_t raw_size = ggml_nbytes(tensor);
            std::vector<uint8_t> tmp(raw_size);
            memcpy(tmp.data(), tensor->data, raw_size);
            ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
        } else {
            ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
        }
    } else {
        // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
        // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
        ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
    }

    ov_weight.weight_node->set_friendly_name(tensor->name);
    if (!is_ov_buffer) {
        return ov_weight.weight_node;
    }

    ggml_openvino_extra_base * extra;
    if (ov_weight.is_quantized()) {
        extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
                                                         std::move(ov_weight.zp), ov_weight.weight_node);
    } else {
        extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
    }
    ggml_openvino_buffer_register_extra(tensor, extra);

    return ov_weight.weight_node;
}

void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
    std::ofstream file(filename);
    if (!file.is_open()) {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    file << "=== GRAPH ===\n";

    // clang-format off
    file << "n_nodes = " << cgraph->n_nodes << "\n";
    file << " " << std::setw(3) << "nodes"
                <<  std::setw(15) << "shape"
                << std::setw(20) << "op"
                << std::setw(20) << "name"
                << std::setw(3) << "    "
                << std::setw(62) << "stride"
                << std::setw(20) << "buffer_type"
                << "\n";
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

        // Get buffer type name
        const char * buf_name = "none";
        ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer;
        if (buf) {
            buf_name = ggml_backend_buffer_name(buf);
        }

        file << " - " << std::setw(3) << i << ": [ "
             << std::setw(5) << node->ne[0] << ", "
             << std::setw(5) << node->ne[1] << ", "
             << std::setw(5) << node->ne[2] << ", "
             << std::setw(5) << node->ne[3] << "] "
             << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " "
             << std::left << std::setw(45) << node->name << std::right
             << std::setw(2) << "[ "
             << std::setw(0) << node->nb[0] << ", "
             << std::setw(5) << node->nb[1] << ", "
             << std::setw(5) << node->nb[2] << ", "
             << std::setw(5) << node->nb[3] << "] "
             << std::right << std::setw(15) << buf_name << std::right
             << "\n";

        for (int i = 0; i < GGML_MAX_SRC; i++) {
            if (auto* src = node->src[i]) {
                // Get buffer type name for source
                const char * src_buf_name = "none";
                ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer;
                if (src_buf) {
                    src_buf_name = ggml_backend_buffer_name(src_buf);
                }

                file << std::setw(10) << " [ "
                << std::setw(5) << src->ne[0] << ", "
                << std::setw(5) << src->ne[1] << ", "
                << std::setw(5) << src->ne[2] << ", "
                << std::setw(5) << src->ne[3] << "] "
                << std::setw(12)
                << i << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right;
                file << std::left << std::setw(30) << src->name << std::right
                << std::setw(16) << "[ "
                << std::setw(0) << src->nb[0] << ", "
                << std::setw(5) << src->nb[1] << ", "
                << std::setw(5) << src->nb[2] << ", "
                << std::setw(5) << src->nb[3] << "] "
                << std::right << std::setw(15) << src_buf_name << std::right
                << "\n";
            }
        }
    }

    file << "n_leafs = " << cgraph->n_leafs << "\n";
    for (int i = 0; i < cgraph->n_leafs; i++) {
        ggml_tensor * node = cgraph->leafs[i];

        // Get buffer type name for leaf
        const char * leaf_buf_name = "none";
        ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer;
        if (leaf_buf) {
            leaf_buf_name = ggml_backend_buffer_name(leaf_buf);
        }

        file << " - " << std::setw(3) << i << ": [ "
             << std::setw(5) << node->ne[0] << ", "
             << std::setw(5) << node->ne[1] << "] "
             << std::setw(8) << ggml_op_name(node->op) << " "
             << std::setw(16) << ggml_get_name(node)
             << std::setw(20) << leaf_buf_name << "\n";
    }
    // clang-format on
    file << "========================================\n";

    file.close();
}

void print_tensor_address_map(const ggml_cgraph * cgraph) {
    std::map<void *, std::vector<std::string>> address_map;
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        auto * node = cgraph->nodes[node_n];
        if (node->data) {
            auto it = address_map.find(node->data);
            if (it == address_map.end()) {
                address_map[node->data] = std::vector<std::string>();
            }
            address_map[node->data].push_back(node->name);
        }
    }
    for (const auto & pair : address_map) {
        std::cout << "Address: " << pair.first << std::endl;
        for (const auto & name : pair.second) {
            std::cout << name << " ; ";
        }
        std::cout << std::endl << std::endl;
    }
}

ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
    std::vector<size_t> shape;
    for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
        shape.push_back(static_cast<size_t>(tensor->ne[i]));
    }
    return shape;
}

std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
    std::vector<size_t> stride;
    for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
        stride.push_back(static_cast<size_t>(tensor->nb[i]));
    }
    return stride;
}

ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
    switch (tensor->type) {
    case GGML_TYPE_F64:
        return ov::element::f64;
    case GGML_TYPE_F32:
        return ov::element::f32;
    case GGML_TYPE_F16:
        return ov::element::f16;
    case GGML_TYPE_BF16:
        return ov::element::bf16;
    case GGML_TYPE_I8:
        return ov::element::i8;
    case GGML_TYPE_I16:
        return ov::element::i16;
    case GGML_TYPE_I32:
        return ov::element::i32;
    case GGML_TYPE_I64:
        return ov::element::i64;
    default:
        return ov::element::dynamic;
    }
}

ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const {
    return ov::PartialShape(get_shape(m_node_info_list[node_idx].node_inputs.at(name)));
}

std::vector<size_t> GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const {
    return get_stride(m_node_info_list[node_idx].node_inputs.at(name));
}

size_t GgmlOvDecoder::get_view_input_size(int node_idx, const std::string & name) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        return it->second.size();
    }
    return 0;
}

size_t GgmlOvDecoder::get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            return it->second[view_index].second->view_offs;
        }
    }
    return 0;
}

size_t GgmlOvDecoder::get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            auto * view_tensor = it->second[view_index].second;
            if (view_tensor && view_tensor->src[0]) {
                return view_tensor->src[0]->view_offs;
            }
        }
    }
    return 0;
}

std::vector<size_t> GgmlOvDecoder::get_view_input_stride(int node_idx,
                                                         const std::string & name,
                                                         size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            return get_stride(it->second[view_index].second);
        }
    }
    return {};
}

std::vector<size_t> GgmlOvDecoder::get_view_input_src_stride(int node_idx,
                                                             const std::string & name,
                                                             size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            auto * view_tensor = it->second[view_index].second;
            if (view_tensor && view_tensor->src[0]) {
                return get_stride(view_tensor->src[0]);
            }
        }
    }
    return {};
}

ov::Shape GgmlOvDecoder::get_view_input_ggml_shape(int node_idx, const std::string & name, size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            return get_shape(it->second[view_index].second);
        }
    }
    return {};
}

ov::Shape GgmlOvDecoder::get_view_input_src_ggml_shape(int node_idx,
                                                       const std::string & name,
                                                       size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            auto * view_tensor = it->second[view_index].second;
            if (view_tensor && view_tensor->src[0]) {
                return get_shape(view_tensor->src[0]);
            }
        }
    }
    return {};
}

ov::PartialShape GgmlOvDecoder::get_view_input_ov_shape(int node_idx,
                                                        const std::string & name,
                                                        size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            auto * tensor = it->second[view_index].second;
            ov::PartialShape shape = ov::PartialShape{get_shape(tensor)};

            // Check if this tensor has a dynamic dimension
            auto dynamic_it = m_node_dynamic_dims.find(tensor);
            if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) {
                int dynamic_dim_index = dynamic_it->second;
                // GGML uses reverse indexing, so convert to OpenVINO indexing
                shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1;
            }

            return shape;
        }
    }
    return {};
}

ov::PartialShape GgmlOvDecoder::get_view_input_src_ov_shape(int node_idx,
                                                            const std::string & name,
                                                            size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            auto * view_tensor = it->second[view_index].second;
            if (view_tensor && view_tensor->src[0]) {
                auto * src_tensor = view_tensor->src[0];
                ov::PartialShape shape = ov::PartialShape{get_shape(src_tensor)};

                // Check if this tensor has a dynamic dimension
                auto dynamic_it = m_node_dynamic_dims.find(src_tensor);
                if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) {
                    int dynamic_dim_index = dynamic_it->second;
                    // GGML uses reverse indexing, so convert to OpenVINO indexing
                    shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1;
                }

                return shape;
            }
        }
    }
    return {};
}

std::string GgmlOvDecoder::get_view_input_name(int node_idx, const std::string & name, size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            return it->second[view_index].second->name;
        }
    }
    return "";
}

std::string GgmlOvDecoder::get_view_input_src_name(int node_idx, const std::string & name, size_t view_index) const {
    auto it = m_node_info_list[node_idx].node_inputs_views.find(name);
    if (it != m_node_info_list[node_idx].node_inputs_views.end()) {
        if (view_index < it->second.size()) {
            auto * view_tensor = it->second[view_index].second;
            if (view_tensor && view_tensor->src[0]) {
                return view_tensor->src[0]->name;
            }
        }
    }
    return "";
}

ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const {
    return get_ov_type(m_node_info_list[node_idx].node_inputs.at(name));
}

size_t GgmlOvDecoder::get_input_size() const {
    return m_model_inputs.size();
}

size_t GgmlOvDecoder::get_input_size(int node_idx) const {
    return m_node_info_list[node_idx].node_inputs_names.size();
}

std::vector<std::string> GgmlOvDecoder::get_input_names(int node_idx) const {
    return m_node_info_list[node_idx].node_inputs_names;
}

ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const {
    auto * ggml_tensor = m_node_info_list[node_idx].node_output;
    return ov::PartialShape(get_shape(ggml_tensor));
}

ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const {
    return get_ov_type(m_node_info_list[node_idx].node);
}

std::vector<size_t> GgmlOvDecoder::get_output_stride(int node_idx) const {
    auto * ggml_tensor = m_node_info_list[node_idx].node;
    return get_stride(ggml_tensor);
}

std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
    return {m_node_info_list[node_idx].node_output_name};
}

const std::string & GgmlOvDecoder::get_op_name() const {
    static const std::string unknown_name = "UNKNOWN_OP_NAME";
    return unknown_name;
}

int32_t GgmlOvDecoder::get_op_dynamic_dim(int node_idx) const {
    auto it = m_node_dynamic_dims.find(m_node_info_list[node_idx].node);
    if (it == m_node_dynamic_dims.end()) {
        return -1;
    }
    return it->second;
}

const std::string & GgmlOvDecoder::get_op_name(int node_idx) const {
    return m_node_info_list[node_idx].node_name;
}

int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const {
    return m_node_info_list[node_idx].node_inputs.at(name)->op_params;
}

int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const {
    return m_node_info_list[node_idx].node->op_params;
}

size_t GgmlOvDecoder::get_output_op_offset(int node_idx) const {
    return m_node_info_list[node_idx].node->view_offs;
}

void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const {
    for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) {
        if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) {
            continue;
        }
        node_visitor(std::make_shared<GgmlOvDecoder>(*this), node_idx);
    }
}

std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
    static const std::map<ggml_op, std::string> ops = {
        {GGML_OP_NONE,            "GGML_OP_NONE"           },
        {GGML_OP_ACC,             "GGML_OP_ACC"            },
        {GGML_OP_ADD,             "GGML_OP_ADD"            },
        {GGML_OP_ADD1,            "GGML_OP_ADD1"           },
        {GGML_OP_ADD_ID,          "GGML_OP_ADD_ID"         },
        {GGML_OP_CONCAT,          "GGML_OP_CONCAT"         },
        {GGML_OP_CONT,            "GGML_OP_CONT"           },
        {GGML_OP_DIV,             "GGML_OP_DIV"            },
        {GGML_OP_DUP,             "GGML_OP_DUP"            },
        {GGML_OP_GET_ROWS,        "GGML_OP_GET_ROWS"       },
        {GGML_OP_MUL,             "GGML_OP_MUL"            },
        {GGML_OP_MUL_MAT,         "GGML_OP_MUL_MAT"        },
        {GGML_OP_MUL_MAT_ID,      "GGML_OP_MUL_MAT_ID"     },
        {GGML_OP_PERMUTE,         "GGML_OP_PERMUTE"        },
        {GGML_OP_RESHAPE,         "GGML_OP_RESHAPE"        },
        {GGML_OP_RMS_NORM,        "GGML_OP_RMS_NORM"       },
        {GGML_OP_NORM,            "GGML_OP_NORM"           },
        {GGML_OP_ROPE,            "GGML_OP_ROPE"           },
        {GGML_OP_SCALE,           "GGML_OP_SCALE"          },
        {GGML_OP_SOFT_MAX,        "GGML_OP_SOFT_MAX"       },
        {GGML_OP_SUM_ROWS,        "GGML_OP_SUM_ROWS"       },
        {GGML_OP_SUB,             "GGML_OP_SUB"            },
        {GGML_OP_TRANSPOSE,       "GGML_OP_TRANSPOSE"      },
        {GGML_OP_VIEW,            "GGML_OP_VIEW"           },
        {GGML_OP_SET_ROWS,        "GGML_OP_SET_ROWS"       },
        {GGML_OP_CPY,             "GGML_OP_CPY"            },
        {GGML_OP_FLASH_ATTN_EXT,  "GGML_OP_FLASH_ATTN_EXT" },
        {GGML_OP_L2_NORM,         "GGML_OP_L2_NORM"        },
        {GGML_OP_CLAMP,           "GGML_OP_CLAMP"          },
        {GGML_OP_PAD,             "GGML_OP_PAD"            },
        {GGML_OP_SSM_CONV,        "GGML_OP_SSM_CONV"       },
        {GGML_OP_GATED_DELTA_NET, "GGML_OP_GATED_DELTA_NET"},
        {GGML_OP_ARGSORT,         "GGML_OP_ARGSORT"        },
        {GGML_OP_REPEAT,          "GGML_OP_REPEAT"         },
        {GGML_OP_IM2COL,          "GGML_OP_IM2COL"         }
    };
    static const std::map<ggml_unary_op, std::string> unary_ops = {
        {GGML_UNARY_OP_ABS,         "GGML_UNARY_OP_ABS"        },
        {GGML_UNARY_OP_SGN,         "GGML_UNARY_OP_SGN"        },
        {GGML_UNARY_OP_NEG,         "GGML_UNARY_OP_NEG"        },
        {GGML_UNARY_OP_STEP,        "GGML_UNARY_OP_STEP"       },
        {GGML_UNARY_OP_TANH,        "GGML_UNARY_OP_TANH"       },
        {GGML_UNARY_OP_ELU,         "GGML_UNARY_OP_ELU"        },
        {GGML_UNARY_OP_RELU,        "GGML_UNARY_OP_RELU"       },
        {GGML_UNARY_OP_SIGMOID,     "GGML_UNARY_OP_SIGMOID"    },
        {GGML_UNARY_OP_GELU,        "GGML_UNARY_OP_GELU"       },
        {GGML_UNARY_OP_GELU_QUICK,  "GGML_UNARY_OP_GELU_QUICK" },
        {GGML_UNARY_OP_SILU,        "GGML_UNARY_OP_SILU"       },
        {GGML_UNARY_OP_SOFTPLUS,    "GGML_UNARY_OP_SOFTPLUS"   },
        {GGML_UNARY_OP_HARDSWISH,   "GGML_UNARY_OP_HARDSWISH"  },
        {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
        {GGML_UNARY_OP_EXP,         "GGML_UNARY_OP_EXP"        },
        {GGML_UNARY_OP_COUNT,       "GGML_UNARY_OP_COUNT"      }
    };
    static const std::map<ggml_glu_op, std::string> glu_ops = {
        {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
        {GGML_GLU_OP_GEGLU,  "GGML_GLU_OP_GEGLU" },
        {GGML_GLU_OP_REGLU,  "GGML_GLU_OP_REGLU" }
    };

    switch (node->op) {
    case GGML_OP_UNARY:
        return unary_ops.at(ggml_get_unary_op(node));
    case GGML_OP_GLU:
        return glu_ops.at(ggml_get_glu_op(node));
    default:
        return ops.at(node->op);
    }
    static const std::string unknown_op = "UNKNOWN_GGML_OP";
    return unknown_op;
}

const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
    return m_node_info_list[node_idx].node_op_type;
}

const std::string & GgmlOvDecoder::get_op_type() const {
    static const std::string unknown_op = "UNKNOWN_GGML_OP";
    return unknown_op;
}

void GgmlOvDecoder::compute_node_dynamic_dims() {
    auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
        if (!node) {
            return;
        }

        if (node->op == GGML_OP_CPY) {
            m_node_dynamic_dims[node] = -1;
        }

        if (m_node_dynamic_dims.count(node)) {
            return;
        }
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            ggml_tensor * src = node->src[i];
            if (src == nullptr) {
                continue;
            }
            struct ggml_tensor * root_src = nullptr;
            // if (src->org_src) {
            //     root_src = src->org_src;
            // }
            if (root_src) {
                if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) || is_output_idx(root_src, node)) {
                    m_node_dynamic_dims[root_src] = 0;
                    m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
                    continue;
                }
                self(self, root_src);
                m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
            } else {
                if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) {
                    m_node_dynamic_dims[src] = 0;
                    continue;
                }
                if (node->op == GGML_OP_VIEW && src->op == GGML_OP_NONE && !is_stateful() && !m_model_is_splitted) {
                    m_node_dynamic_dims[src] = 1;
                    continue;
                }
                self(self, src);
            }
        }
        switch (node->op) {
        case GGML_OP_NONE:
            m_node_dynamic_dims[node] = -1;
            break;
        case GGML_OP_GET_ROWS:
            m_node_dynamic_dims[node] = -1;
            if (m_node_dynamic_dims[node->src[1]] != -1) {
                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[1]];
                if (dynamic_dim_idx == 0) {
                    m_node_dynamic_dims[node] = 1;
                } else {
                    auto dynamic_dim_stride = node->src[1]->nb[dynamic_dim_idx] / ggml_type_size(node->src[1]->type) *
                                              ggml_type_size(node->src[0]->type);
                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
                        if (dynamic_dim_stride == node->src[0]->nb[i]) {
                            m_node_dynamic_dims[node] = i;
                            break;
                        }
                    }
                }
                // OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
                //                 "Dynamic dim value mismatch for node: " + std::string(node->name) +
                //                     " and its src[1]: " + std::string(node->src[1]->name));
            }
            break;
        case GGML_OP_MUL:
        case GGML_OP_MUL_MAT:
            m_node_dynamic_dims[node] = -1;
            if (m_node_dynamic_dims[node->src[0]] != -1) {
                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
            }
            if (m_node_dynamic_dims[node->src[1]] != -1) {
                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
            }
            break;
        case GGML_OP_PERMUTE:
            m_node_dynamic_dims[node] = -1;
            if (m_node_dynamic_dims[node->src[0]] != -1) {
                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
                // auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
                for (int i = 0; i < GGML_MAX_DIMS; i++) {
                    if (node->op_params[i] == dynamic_dim_idx) {
                        m_node_dynamic_dims[node] = i;
                        break;
                    }
                }
                // OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
                //                 "Dynamic dim value mismatch for node: " + std::string(node->name) +
                //                     " and its src[0]: " + std::string(node->src[0]->name));
            }
            break;
        case GGML_OP_VIEW: {
            // Use stride-based matching: the stride of a VIEW dimension directly
            // encodes which source dimension it indexes into, so it uniquely
            // identifies the dynamic dim even when two dims share the same size.
            m_node_dynamic_dims[node] = -1;
            if (m_node_dynamic_dims[node->src[0]] != -1) {
                if (node->src[0]->op == GGML_OP_NONE) {
                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
                    break;
                }
                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
                auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
                auto dynamic_dim_stride =
                    node->src[0]->nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) * ggml_type_size(node->type);
                for (int i = 0; i < GGML_MAX_DIMS; i++) {
                    if (node->nb[i] == dynamic_dim_stride) {
                        m_node_dynamic_dims[node] = i;
                        break;
                    }
                }
                if (m_node_dynamic_dims[node] != -1 && dynamic_dim_value != node->ne[m_node_dynamic_dims[node]]) {
                    m_node_dynamic_dims[node] = -1;
                    // std::cout << "Warning: Dynamic dim value mismatch for node: " << node->name
                    //           << " and its src[0]: " << node->src[0]->name << std::endl;
                }
            }
            break;
        }
        case GGML_OP_TRANSPOSE:
        case GGML_OP_RESHAPE: {
            // RESHAPE requires src[0] to be contiguous, so both src and result
            // have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]).
            // Match src->nb[dynamic_dim] against result->nb[i] to find the output
            // dimension whose flat-memory boundary aligns with the source dynamic
            // boundary. This is unambiguous (result strides are strictly monotone)
            // and handles merged-lower-dim cases that ne-value matching misses.
            m_node_dynamic_dims[node] = -1;
            if (m_node_dynamic_dims[node->src[0]] != -1) {
                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
                auto dynamic_dim_stride = node->src[0]->nb[dynamic_dim_idx];
                for (int i = 0; i < GGML_MAX_DIMS; i++) {
                    if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
                        m_node_dynamic_dims[node] = i;
                        break;
                    }
                }
                if (m_node_dynamic_dims[node] == -1) {
                    // std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl;
                }
            }
            break;
        }
        case GGML_OP_FLASH_ATTN_EXT: {
            // Output shape is hard-coded in ggml_flash_attn_ext as:
            //   ne = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] }
            // i.e. output dim 0 <- v dim 0 (head_size, static)
            //      output dim 1 <- q dim 2 (n_heads,   static)
            //      output dim 2 <- q dim 1 (n_tokens,  potentially dynamic)
            //      output dim 3 <- q dim 3 (batch,     static)
            // Using the fixed q-dim -> output-dim mapping table.
            // q is src[0]; the mapping from q's dynamic dim to the output dim is:
            //   q dim 1 -> output dim 2
            //   q dim 2 -> output dim 1
            //   q dim 3 -> output dim 3
            //   q dim 0 -> output dim 0  (head_size axis, unlikely to be dynamic)
            constexpr int q_to_out[GGML_MAX_DIMS] = {0, 2, 1, 3};
            m_node_dynamic_dims[node] = -1;
            if (m_node_dynamic_dims[node->src[0]] != -1) {
                auto q_dynamic_dim = m_node_dynamic_dims[node->src[0]];
                m_node_dynamic_dims[node] = q_to_out[q_dynamic_dim];
            }
            break;
        }
        case GGML_OP_CONT:
            m_node_dynamic_dims[node] = -1;
            if (m_node_dynamic_dims[node->src[0]] != -1) {
                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
                if (ggml_are_same_shape(node, node->src[0])) {
                    m_node_dynamic_dims[node] = dynamic_dim_idx;
                } else {
                    size_t src_logical_nb[GGML_MAX_DIMS];
                    src_logical_nb[0] = ggml_type_size(node->src[0]->type);
                    src_logical_nb[1] = src_logical_nb[0] * (node->src[0]->ne[0] / ggml_blck_size(node->src[0]->type));
                    for (int i = 2; i < GGML_MAX_DIMS; i++) {
                        src_logical_nb[i] = src_logical_nb[i - 1] * node->src[0]->ne[i - 1];
                    }

                    auto dynamic_dim_stride = src_logical_nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) *
                                              ggml_type_size(node->type);
                    int matched_dim_count = 0;
                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
                        if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
                            m_node_dynamic_dims[node] = i;
                            matched_dim_count++;
                        }
                    }
                    if (matched_dim_count != 1) {
                        m_node_dynamic_dims[node] = -1;
                        // std::cout << "Warning: Cannot determine dynamic dim for CONT node: " << node->name
                        //           << " and its src[0]: " << node->src[0]->name << std::endl;
                    }
                }
            }
            break;
        case GGML_OP_RMS_NORM:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
        case GGML_OP_GLU:
        case GGML_OP_ROPE:
        case GGML_OP_SCALE:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ARGSORT:
        case GGML_OP_ADD_ID:
        case GGML_OP_UNARY:
            m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
            break;
        case GGML_OP_MUL_MAT_ID:
            m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
            break;
        case GGML_OP_CPY:
        case GGML_OP_SET_ROWS:
            m_node_dynamic_dims[node] = -1;
            break;
        case GGML_OP_IM2COL: {
            m_node_dynamic_dims[node] = -1;
            if (m_node_dynamic_dims[node->src[1]] != -1) {
                const bool is_2D = node->op_params[6] == 1;
                const int src_dyn = m_node_dynamic_dims[node->src[1]];
                if (is_2D) {
                    if (src_dyn == 0) {
                        m_node_dynamic_dims[node] = 1;  // IW -> OW
                    } else if (src_dyn == 1) {
                        m_node_dynamic_dims[node] = 2;  // IH -> OH
                    } else if (src_dyn == 3) {
                        m_node_dynamic_dims[node] = 3;  // N  -> N
                    }
                } else {
                    if (src_dyn == 0) {
                        m_node_dynamic_dims[node] = 1;  // IW -> OW
                    } else if (src_dyn == 2) {
                        m_node_dynamic_dims[node] = 2;  // N  -> N  (1D: b->ne[2] is the batch/channel dim)
                    }
                }
                if (m_node_dynamic_dims[node] != -1) {
                    OPENVINO_ASSERT(node->src[1]->ne[src_dyn] == node->ne[m_node_dynamic_dims[node]],
                                    "Dynamic dim value mismatch for IM2COL node: " + std::string(node->name) +
                                        " and its src[1]: " + std::string(node->src[1]->name));
                }
            }
            break;
        }
        default:
            // std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
            break;
        }
    };

    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        ggml_tensor * node = m_cgraph->nodes[i];
        visit_node(visit_node, node);
    }

    // print the nodes in m_cgraph name & shape with the dynamic dim (the dynamic dim is the dimension with -1 in m_node_dynamic_dims) for debugging
    if (0) {
        for (int i = 0; i < m_cgraph->n_nodes; i++) {
            ggml_tensor * node = m_cgraph->nodes[i];
            int dynamic_dim = m_node_dynamic_dims[node];
            std::cout << "[" << i << "] " << "node_name: " << node->name << " op: " << ggml_op_name(node->op)
                      << " shape: [";
            for (int j = 0; j < 4; j++) {
                if (j == dynamic_dim) {
                    std::cout << "*";
                } else {
                    std::cout << node->ne[j];
                }
                if (j < 3) {
                    std::cout << ", ";
                }
            }
            std::cout << "]" << std::endl;
            // print the src name & shape with the dynamic dim for debugging
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                ggml_tensor * src = node->src[j];
                if (src == nullptr) {
                    continue;
                }
                int src_dynamic_dim = m_node_dynamic_dims[src];
                std::cout << "    [" << j << "] src_name: " << src->name << " [";
                for (int k = 0; k < 4; k++) {
                    if (k == src_dynamic_dim) {
                        std::cout << "*";
                    } else {
                        std::cout << src->ne[k];
                    }
                    if (k < 3) {
                        std::cout << ", ";
                    }
                }
                std::cout << "]" << std::endl;
            }
            std::cout << std::endl;
        }
    }
}