#include "ggml-decoder.h"
#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino.h"
#include "ggml-quants.h"
#include "ggml.h"
#include "utils.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <map>
#include <memory>
#include <openvino/core/dimension.hpp>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/partial_shape.hpp>
#include <openvino/core/type/bfloat16.hpp>
#include <openvino/core/type/element_type.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/runtime/tensor.hpp>
#include <ostream>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>
// Constructs a decoder for a compute graph with explicit model/compute parameters.
// After storing the configuration it validates the graph, builds the per-node info
// list, derives dynamic dims and the model input/output tables, classifies every
// node (op case and op type) and finally registers the extra scalar inputs.
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                             ModelParams & model_params,
                             ComputeParams & compute_params,
                             std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                             bool is_static,
                             bool is_stateful,
                             bool model_is_splitted,
                             bool is_prefill,
                             int prefill_chunk_size) :
    m_is_static(is_static),
    m_is_stateful(is_stateful),
    m_is_prefill(is_prefill),
    m_naive(false),
    m_prefill_chunk_size(prefill_chunk_size),
    m_model_is_splitted(model_is_splitted),
    m_cgraph(cgraph),
    m_model_weights(model_weights),
    m_model_params(model_params),
    m_compute_params(compute_params) {
    // The tensor address map is printed at most once, and only when the
    // GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS environment switch is set.
    static bool address_map_dumped = false;
    if (!address_map_dumped && ggml_openvino_getenv_int("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
        address_map_dumped = true;
        print_tensor_address_map(cgraph);
    }

    validate_cgraph();
    set_input_output();
    compute_node_dynamic_dims();
    compute_model_inputs();
    compute_model_outputs();

    // set_input_output() created one info entry per graph node; classify each one.
    for (int idx = 0; idx < cgraph->n_nodes; ++idx) {
        auto & info = m_node_info_list[idx];
        info.node_op_case = compute_op_case(info.node);
        info.node_op_type = compute_op_type(info.node);
    }

    add_extra_inputs();
}
// Points the decoder at another compute graph and rebuilds the per-node info list
// together with the model input/output tables. Node op case/type are not
// recomputed here; set_input_output() leaves node_op_case at 0.
void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
    m_cgraph = cgraph;

    // Discard everything that was derived from the previous graph.
    m_node_info_list.clear();
    m_model_outputs.clear();
    m_model_inputs.clear();

    // Rebuild in dependency order: node info first, then inputs, then outputs.
    set_input_output();
    compute_model_inputs();
    compute_model_outputs();
}
// Constructs a "naive" decoder: only the graph and the weight map are supplied,
// no model/compute parameters. Builds node info, model inputs/outputs and the
// per-node op case/type; no validation and no extra inputs are added.
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
    m_naive         = true;
    m_cgraph        = cgraph;
    m_model_weights = model_weights;

    set_input_output();
    compute_model_inputs();
    compute_model_outputs();

    // One info entry exists per graph node after set_input_output().
    for (int idx = 0; idx < cgraph->n_nodes; ++idx) {
        auto & info = m_node_info_list[idx];
        info.node_op_case = compute_op_case(info.node);
        info.node_op_type = compute_op_type(info.node);
    }
}
void GgmlOvDecoder::set_input_output() {
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
auto node = m_cgraph->nodes[node_n];
NodeInfo current_node_info;
auto node_name = std::string(node->name);
auto node_output_name = node_name;
auto * node_output = node;
if (node->op == GGML_OP_SET_ROWS) {
node_output_name = std::string(node->view_src->name);
node_output = node->view_src;
}
current_node_info.node = node;
current_node_info.node_name = node_name;
current_node_info.node_output = node_output;
current_node_info.node_output_name = node_output_name;
current_node_info.node_op_case = 0;
current_node_info.data_addr = node->data;
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto * src = node->src[i];
if (src == nullptr) {
continue;
}
auto src_name = std::string(src->name);
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
src_name = get_graph_input_ov_name(src, node);
}
current_node_info.node_inputs[src_name] = src;
current_node_info.node_inputs_names.push_back(src_name);
if (src->op == GGML_OP_VIEW) {
std::remove_reference_t<decltype(current_node_info.node_inputs_views[src_name])> view_chain;
auto current = src;
while (current != nullptr) {
auto current_name = std::string(current->name);
if (current->flags & GGML_TENSOR_FLAG_INPUT) {
current_name = get_graph_input_ov_name(current, node);
}
view_chain.emplace_back(current_name, current);
if (current->src[0] != nullptr && current->src[0]->op == GGML_OP_VIEW) {
current = current->src[0];
} else {
break;
}
}
current_node_info.node_inputs_views[src_name] = view_chain;
}
}
m_node_info_list.push_back(current_node_info);
}
}
// Returns an integer "op case" for `node`, chosen from the node's op and from the
// ops/shapes of its sources. 0 is the default when no special pattern matches.
// The meaning of each number is defined by whoever consumes node_op_case.
int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
int op_case = 0;
switch (node->op) {
case GGML_OP_RESHAPE: {
auto * src = node->src[0];
// 4: reshape of a reshape whose original source has the same ne[0]/ne[1] as this node.
if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) {
op_case = 4;
// 1: the source's ne[0] equals node ne[0] * ne[1].
} else if (node->ne[0] * node->ne[1] == src->ne[0]) {
op_case = 1;
// 2: source ne[0] * ne[1] equals the node's ne[0];
// 5: additionally source ne[2] * ne[3] equals the node's ne[1].
} else if (src->ne[0] * src->ne[1] == node->ne[0]) {
op_case = 2;
if (src->ne[2] * src->ne[3] == node->ne[1]) {
op_case = 5;
}
// 3: source ne[0] * ne[1] * ne[2] equals the node's ne[1].
} else if (src->ne[0] * src->ne[1] * src->ne[2] == node->ne[1]) {
op_case = 3;
// 6: source ne[1] * ne[2] equals the node's ne[1].
} else if (src->ne[1] * src->ne[2] == node->ne[1]) {
op_case = 6;
}
// Fallback: any other reshape with matching element count is also case 6.
if (op_case == 0 && ggml_nelements(node) == ggml_nelements(src)) {
op_case = 6;
}
break;
}
case GGML_OP_PERMUTE: {
// 1: the permuted tensor is not a view.
if (node->src[0]->op != GGML_OP_VIEW) {
op_case = 1;
// View of a GGML_OP_NONE tensor: the case depends on view contiguity and on
// whether the layer parsed from node->view_src's name is an SWA layer.
} else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
std::string src_name(node->view_src->name);
// NOTE(review): .value() throws std::bad_optional_access when the name carries no layer index.
int layer = extract_layer_from_name(src_name).value();
if (ggml_is_contiguous(node->src[0])) {
if (!is_swa_layer(layer)) {
op_case = 3;
} else {
op_case = 4;
}
} else {
if (!is_swa_layer(layer)) {
op_case = 5;
} else {
op_case = 6;
}
}
// 2: view of anything else.
} else {
op_case = 2;
}
break;
}
case GGML_OP_MUL_MAT: {
// 3: both operands are views; 2: second operand is a SOFT_MAX result; otherwise 0.
if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
op_case = 3;
} else if (node->src[1]->op == GGML_OP_SOFT_MAX) {
op_case = 2;
}
break;
}
case GGML_OP_GET_ROWS: {
// 2: the second source is a view.
if (node->src[1]->op == GGML_OP_VIEW) {
op_case = 2;
}
break;
}
case GGML_OP_ROPE: {
// The rope mode is read from op_params[2]: NEOX -> 1, IMROPE -> 2, anything else -> 0.
const int mode = node->op_params[2];
switch (mode) {
case GGML_ROPE_TYPE_NEOX: {
op_case = 1;
break;
}
case GGML_ROPE_TYPE_IMROPE: {
op_case = 2;
break;
}
default:
op_case = 0;
break;
}
break;
}
case GGML_OP_VIEW: {
// NOTE(review): every path through this case leaves op_case at 0 (or returns 0);
// the checks below currently have no effect on the result. They look like
// placeholders for cases that are not distinguished yet - confirm before removing.
if (node->src[0]->op == GGML_OP_VIEW) {
auto * src = node->src[0];
if (ggml_nelements(node) != ggml_nelements(src)) {
}
op_case = 0;
if (m_model_is_splitted && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
op_case = 0;
}
}
{
auto * src = node->src[0];
if (ggml_nelements(node) != ggml_nelements(src)) {
// View that keeps ne[0], maps src dim 2 onto node dim 1 (same strides) and drops src dim 1.
if (node->ne[0] == src->ne[0] && node->ne[1] == src->ne[2] && node->ne[2] == 1 &&
node->nb[0] == src->nb[0] && node->nb[1] == src->nb[2] && src->ne[1] > 1) {
op_case = 0;
break;
}
// Count differing dims; a view larger than its source in any dim returns 0 immediately.
int diff_count = 0;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != src->ne[i]) {
diff_count++;
}
if (node->ne[i] > src->ne[i]) {
return 0;
}
}
if (diff_count >= 1) {
op_case = 0;
}
}
}
break;
}
default:
break;
}
return op_case;
}
// Parses the layer index that follows the first "_l" in a tensor name.
// The number ends at the next space or at the end of the string, so both
// "cache_k_l12" and "cache_k_l12 (view)" yield 12.
//
// @param name  tensor name to inspect
// @return the parsed layer index, or std::nullopt when the name has no "_l"
//         marker or when the text after the marker is not an int. The original
//         let std::stoi's std::invalid_argument / std::out_of_range escape,
//         e.g. for "attn_layer" or "x_l".
std::optional<int> extract_layer_from_name(const std::string & name) {
    const size_t marker_pos = name.find("_l");
    if (marker_pos == std::string::npos) {
        return std::nullopt;
    }

    const size_t digits_begin = marker_pos + 2;
    size_t       digits_end   = name.find(' ', digits_begin);
    if (digits_end == std::string::npos) {
        digits_end = name.length();
    }

    // std::stoi reports both "not a number" and "does not fit in int" by throwing;
    // translate either into the empty optional this function already advertises.
    try {
        return std::stoi(name.substr(digits_begin, digits_end - digits_begin));
    } catch (const std::invalid_argument &) {
        return std::nullopt;
    } catch (const std::out_of_range &) {
        return std::nullopt;
    }
}
// Scans the compute graph and derives model-level parameters (head size, KV heads,
// context sizes, SWA layers, rope parameters) and per-call compute parameters
// (input/token lengths, attention sizes, active sequence range, output length).
// `is_static` replaces some of the graph-derived sizes with fixed values.
// Returns the pair {model_params, compute_params}.
std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
ModelParams model_params;
ComputeParams compute_params;
// Classifies a node as one of four attention patterns, or -1 when it matches none:
//   0: FLASH_ATTN_EXT whose src[1] is PERMUTE(VIEW)
//   1: FLASH_ATTN_EXT whose src[1] is CPY(PERMUTE(VIEW))
//   2: SOFT_MAX over MUL_MAT whose src[0] is PERMUTE(VIEW)
//   3: SOFT_MAX over ADD(MUL_MAT(PERMUTE, ...), ...)
auto get_attention_pattern_case = [](const ggml_tensor * node) -> int {
if (node == nullptr) {
return -1;
}
switch (node->op) {
case GGML_OP_FLASH_ATTN_EXT:
if (node->src[0] == nullptr || node->src[1] == nullptr || node->src[3] == nullptr) {
return -1;
}
switch (node->src[1]->op) {
case GGML_OP_PERMUTE:
if (node->src[1]->src[0] != nullptr && node->src[1]->src[0]->op == GGML_OP_VIEW) {
return 0;
}
break;
case GGML_OP_CPY:
if (node->src[1]->src[0] != nullptr && node->src[1]->src[0]->op == GGML_OP_PERMUTE &&
node->src[1]->src[0]->src[0] != nullptr && node->src[1]->src[0]->src[0]->op == GGML_OP_VIEW) {
return 1;
}
break;
default:
break;
}
break;
case GGML_OP_SOFT_MAX:
if (node->src[0] != nullptr && node->src[1] != nullptr && node->src[0]->op == GGML_OP_MUL_MAT &&
node->src[0]->src[0] != nullptr && node->src[0]->src[1] != nullptr &&
node->src[0]->src[0]->op == GGML_OP_PERMUTE && node->src[0]->src[0]->src[0] != nullptr &&
node->src[0]->src[0]->src[0]->op == GGML_OP_VIEW) {
return 2;
}
// NOTE(review): unlike the check above, this one dereferences node->src[0] without a
// null test, and pattern 3 does not require node->src[1] (the mask) to be non-null.
if (node->src[0]->op == GGML_OP_ADD && node->src[0]->src[0] != nullptr &&
node->src[0]->src[0]->op == GGML_OP_MUL_MAT && node->src[0]->src[0]->src[0] != nullptr &&
node->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) {
return 3;
}
break;
default:
break;
}
return -1;
};
bool rope_seen = false;
for (int i = 0; i < cgraph->n_nodes; i++) {
auto * node = cgraph->nodes[i];
std::string name = std::string(node->name);
const int attention_pattern_case = get_attention_pattern_case(node);
if (attention_pattern_case != -1) {
// Locate the permuted K-cache tensor and the attention mask for the matched pattern.
ggml_tensor * cache_k_permute = nullptr;
ggml_tensor * mask = nullptr;
switch (attention_pattern_case) {
case 0:
cache_k_permute = node->src[1];
mask = node->src[3];
break;
case 1:
cache_k_permute = node->src[1]->src[0];
mask = node->src[3];
break;
case 2:
cache_k_permute = node->src[0]->src[0];
mask = node->src[1];
break;
case 3:
cache_k_permute = node->src[0]->src[0]->src[0];
mask = node->src[1];
break;
default:
break;
}
assert(cache_k_permute != nullptr);
model_params.head_size = cache_k_permute->ne[0];
model_params.n_heads_kv = cache_k_permute->ne[2];
compute_params.input_len = node->src[0]->ne[1];
compute_params.token_len_per_seq = node->src[0]->ne[1];
auto * cache_k_view = cache_k_permute->src[0];
// Only continue with cache-derived values when the permute reads from a VIEW and a mask exists.
if (cache_k_view->op != GGML_OP_VIEW || mask == nullptr) {
continue;
}
ggml_tensor * cache_k = cache_k_view->src[0];
// NOTE(review): .value() throws std::bad_optional_access if the cache tensor name has no layer index.
int layer = extract_layer_from_name(cache_k->name).value();
std::string mask_name(mask->name);
model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
// A mask whose name contains "swa" marks this layer as an SWA layer.
if (mask_name.find("swa") != std::string::npos) {
model_params.swa_layers.push_back(layer);
model_params.ctx_per_seq_swa = cache_k->ne[1];
} else {
model_params.ctx_per_seq = cache_k->ne[1];
model_params.n_seq = cache_k->ne[2];
}
compute_params.n_seq_active = mask->ne[3];
// Bytes occupied by one ne[0] x ne[1] slice of the cache; the offset read from the
// view's op_params divided by it gives the index of the first active sequence.
// NOTE(review): divides by seq_size - assumes the cache slice is non-empty.
auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
size_t offset;
memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
compute_params.seq_active_start = offset / seq_size;
if (mask_name.find("swa") != std::string::npos) {
compute_params.attention_size_swa = mask->ne[0];
} else {
compute_params.attention_size = mask->ne[0];
}
// Static mode overrides the attention sizes with the per-sequence context sizes
// and fixes token_len_per_seq to 1.
if (is_static) {
compute_params.attention_size = model_params.ctx_per_seq;
compute_params.attention_size_swa = model_params.ctx_per_seq_swa;
compute_params.token_len_per_seq = 1;
}
}
// MUL_MAT over PERMUTE(VIEW(kv-cache)) with a roped second operand: attention size is the result's ne[0].
if (node->op == GGML_OP_MUL_MAT && node->src[0]->op == GGML_OP_PERMUTE &&
node->src[0]->src[0]->op == GGML_OP_VIEW && is_kvcache(node->src[0]->view_src, node->view_src)) {
if (node->src[1]->op == GGML_OP_PERMUTE && node->src[1]->src[0]->op == GGML_OP_VIEW &&
node->src[1]->src[0]->src[0]->op == GGML_OP_ROPE) {
compute_params.attention_size = node->ne[0];
}
}
// TRANSPOSE over PERMUTE(VIEW): also yields the attention size (context size in static mode).
if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE &&
node->src[0]->src[0]->op == GGML_OP_VIEW) {
compute_params.attention_size = node->ne[0];
if (is_static) {
compute_params.attention_size = model_params.ctx_per_seq;
}
}
if (node->op == GGML_OP_ROPE) {
// Fall back to the element count of the rope's src[1] when no attention node set the token length.
if (compute_params.token_len_per_seq == -1 && node->src[1] != nullptr) {
compute_params.token_len_per_seq = ggml_nelements(node->src[1]);
}
static_assert(sizeof(model_params.rope_params) == sizeof(int32_t) * 15, "rope_params size");
// Keep the first rope's op_params; flag the model when a later rope differs.
if (!rope_seen) {
memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
rope_seen = true;
} else if (memcmp(model_params.rope_params, node->op_params, sizeof(int32_t) * 15) != 0) {
model_params.mixed_rope_params = true;
}
}
}
// The last graph node is treated as the output tensor.
// NOTE(review): indexes nodes[n_nodes - 1] - assumes the graph has at least one node.
auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
compute_params.output_len = output_tensor->ne[1];
if (is_static && compute_params.output_len == 0) {
compute_params.output_len = 1;
}
model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
return {model_params, compute_params};
}
// Rejects configurations this decoder cannot handle.
// Throws std::runtime_error when static mode is combined with more than one sequence.
void GgmlOvDecoder::validate_cgraph() const {
    const bool multiple_sequences = m_model_params.n_seq > 1;
    if (multiple_sequences && m_is_static) {
        throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
    }
}
// Computes the (possibly dynamic) OpenVINO shape of a graph input.
//
// @param op                 the node that consumes the input, or the leaf itself
//                           when `input` is null
// @param input              the source tensor; may be nullptr (compute_model_inputs()
//                           passes nullptr for GGML_OP_NONE graph nodes)
// @param dynamic_dim_index  ggml dimension index to mark dynamic for split models,
//                           or -1 for none
// @return the partial shape in OpenVINO order (ggml dims reversed)
//
// Fix: the non-naive path used to dereference `input` unconditionally, which is
// undefined behaviour for the nullptr call above. A null `input` now falls back
// to the shape of `op`, mirroring the naive path, while still honouring
// `dynamic_dim_index`. The unused local `name` was removed.
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op,
                                                      const ggml_tensor * input,
                                                      int dynamic_dim_index) const {
    if (m_naive) {
        return input != nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
    }

    if (input == nullptr) {
        // No source tensor to classify: describe the leaf itself.
        ov::PartialShape leaf_shape = ov::PartialShape{get_shape(op)};
        if (dynamic_dim_index != -1 && m_model_is_splitted) {
            leaf_shape[3 - dynamic_dim_index] = -1;
        }
        return leaf_shape;
    }

    ov::PartialShape input_shape;
    if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
        // Token / position inputs: fixed length in static mode, dynamic otherwise.
        int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
        input_shape = ov::PartialShape{1, 1, 1, len};
    } else if (is_output_idx(input, op)) {
        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
    } else if (is_inp_mask(input, op)) {
        if (m_is_static) {
            input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
        } else if (m_is_stateful) {
            input_shape = ov::PartialShape{1, 1, -1, -1};
        } else {
            input_shape = ov::PartialShape{-1, 1, -1, -1};
        }
    } else if (is_kvcache(input, op)) {
        input_shape = ov::PartialShape{get_shape(input)};
        if (!m_is_static) {
            input_shape[2] = -1;
        }
        if (is_stateful()) {
            // Stateful models expose the cache as [1, dynamic, n_heads_kv, head_size].
            assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 &&
                   input_shape[2].is_dynamic() &&
                   input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
            input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv,
                           m_model_params.head_size};
        }
    } else if (is_kv_idx(input, op)) {
        int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
        input_shape = ov::PartialShape{1, 1, 1, len};
    } else {
        input_shape = ov::PartialShape{get_shape(input)};
    }

    // Split models additionally mark the requested ggml dimension as dynamic.
    if (dynamic_dim_index != -1 && m_model_is_splitted) {
        input_shape[3 - dynamic_dim_index] = -1;
    }

    // A graph-input leaf used as SOFT_MAX's src[1] gets its two innermost dims made dynamic.
    if (op->op == GGML_OP_SOFT_MAX && op->src[1] != nullptr && op->src[1]->op == GGML_OP_NONE &&
        (op->src[1]->flags & GGML_TENSOR_FLAG_INPUT) && op->src[1] == input) {
        input_shape[2] = -1;
        input_shape[3] = -1;
    }
    return input_shape;
}
// Registers the scalar i64 helper inputs derived from m_compute_params
// (attention sizes, active-sequence range, token length per sequence).
// In static mode each value becomes a Constant node; otherwise it becomes a
// Parameter node plus a tensor holding the current value.
void GgmlOvDecoder::add_extra_inputs() {
    const auto register_scalar = [this](const std::string & name, int64_t value) {
        if (m_is_static) {
            auto constant =
                std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
            constant->set_friendly_name(name);
            m_model_extra_inputs[name] = constant;
            return;
        }

        auto parameter = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
        parameter->set_friendly_name(name);
        parameter->output(0).get_tensor().set_names({name});
        m_model_extra_inputs[name] = parameter;

        auto value_tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
        *value_tensor->data<int64_t>() = value;
        m_model_extra_input_values[name] = value_tensor;
    };

    const auto & params = m_compute_params;

    // -1 means "not determined", in which case the input is omitted.
    if (params.attention_size != -1) {
        register_scalar("attention_size", params.attention_size);
    }
    if (params.attention_size_swa != -1) {
        register_scalar("attention_size_swa", params.attention_size_swa);
    }

    register_scalar("n_seq_active", params.n_seq_active);
    register_scalar("seq_active_start", params.seq_active_start);
    register_scalar("seq_active_end", params.seq_active_start + params.n_seq_active);

    if (params.token_len_per_seq != -1) {
        register_scalar("token_len_per_seq", params.token_len_per_seq);
    }
}
bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) {
ggml_tensor * node = m_cgraph->nodes[node_idx];
for (int i = node_idx; i < m_cgraph->n_nodes; i++) {
ggml_tensor * other_node = m_cgraph->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (other_node->src[j] == node) {
return true;
}
}
}
return false;
}
// Rebuilds m_inputs / m_model_inputs: one OpenVINO Parameter per tensor that the
// graph reads but neither produces itself nor finds in m_model_weights.
// Also appends names of sources living in ANY-usage buffers to m_model_params.kv_names.
void GgmlOvDecoder::compute_model_inputs() {
m_model_inputs.clear();
m_inputs.clear();
for (int i = 0; i < m_cgraph->n_nodes; i++) {
ggml_tensor * node = m_cgraph->nodes[i];
// A GGML_OP_NONE node that is consumed later is itself a model input (unless it is a weight).
if (node->op == GGML_OP_NONE && node_is_used_as_src(i)) {
std::string node_name(node->name);
if (m_model_weights.find(node_name) == m_model_weights.end()) {
m_inputs[node_name] = node;
// NOTE(review): passes input == nullptr to get_graph_input_shape(), and operator[] on
// m_node_dynamic_dims inserts a default entry when the node is missing - confirm both are intended.
auto param_node = std::make_shared<ov::op::v0::Parameter>(
get_ov_type(node), get_graph_input_shape(node, nullptr, m_node_dynamic_dims[node]));
param_node->set_friendly_name(node_name);
param_node->output(0).get_tensor().set_names({node_name});
m_model_inputs[node_name] = param_node;
}
continue;
}
// NOTE(review): this loop variable shadows the outer `i`; the outer index is not used below.
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto * src = node->src[i];
if (src == nullptr) {
continue;
}
std::string src_name = std::string(src->name);
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
src_name = get_graph_input_ov_name(src, node);
}
// Weights are provided separately and are not model inputs.
if (m_model_weights.find(src_name) != m_model_weights.end()) {
continue;
}
// Sources produced by another graph node are intermediates, not inputs.
bool is_intermediate_node = false;
for (const auto & node_info : m_node_info_list) {
if (node_info.node == src) {
is_intermediate_node = true;
break;
}
}
if (is_intermediate_node) {
continue;
}
// Already registered under this name.
if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
continue;
}
m_inputs[src_name] = src;
ggml_backend_buffer * buffer = src->buffer;
// Sources in ANY-usage buffers are remembered in kv_names (once per name).
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
it == m_model_params.kv_names.end()) {
m_model_params.kv_names.push_back(src_name);
}
}
// Walk through nested views to the underlying tensor; the Parameter is created for that tensor.
while (src->op == GGML_OP_VIEW && src->src[0] != nullptr) {
src = src->src[0];
src_name = std::string(src->name);
}
m_inputs[src_name] = src;
ov::PartialShape param_shape = get_graph_input_shape(node, src, m_node_dynamic_dims[src]);
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
param_node->set_friendly_name(src_name);
param_node->output(0).get_tensor().set_names({src_name});
m_model_inputs[src_name] = param_node;
}
}
}
void GgmlOvDecoder::compute_model_outputs() {
m_model_outputs.clear();
m_model_output_names.clear();
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
auto * cur_node = m_cgraph->nodes[node_n];
if (cur_node->op == GGML_OP_NONE || cur_node->op == GGML_OP_VIEW || cur_node->op == GGML_OP_RESHAPE) {
continue;
}
auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)];
if (cur_node_use_count == 0) {
if (cur_node != nullptr && cur_node->op == GGML_OP_SET_ROWS) {
cur_node = cur_node->view_src;
}
} else {
int input_use_count = 0;
for (int i = 0; i < m_cgraph->n_nodes; i++) {
ggml_tensor * node = m_cgraph->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != NULL && node->src[j] == cur_node) {
input_use_count++;
}
}
}
if (input_use_count == cur_node_use_count) {
cur_node = nullptr;
}
}
if (cur_node != nullptr) {
std::string node_output_name(cur_node->name);
m_model_outputs[node_output_name] = cur_node;
m_model_output_names.push_back(node_output_name);
}
}
}
// Returns the first graph node (in graph order) that has `tensor` as one of its
// sources, or nullptr when `tensor` is null or no node uses it.
const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
    if (tensor == nullptr) {
        return nullptr;
    }
    for (int idx = 0; idx < m_cgraph->n_nodes; ++idx) {
        const ggml_tensor * candidate = m_cgraph->nodes[idx];
        ggml_tensor * const * srcs_begin = candidate->src;
        ggml_tensor * const * srcs_end   = srcs_begin + GGML_MAX_SRC;
        if (std::find(srcs_begin, srcs_end, tensor) != srcs_end) {
            return candidate;
        }
    }
    return nullptr;
}
// Looks a tensor up by name among the sources of every node in the graph.
//
// @param name  ggml tensor name to search for
// @return the first source tensor (graph order, then slot order) whose name
//         equals `name`, or nullptr when none matches
//
// Fix: the scan used to `break` at the first empty source slot, so a matching
// tensor in a later slot of the same node was never found. Every other source
// scan in this file skips empty slots with `continue`; this one now does too.
const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        const auto * node = m_cgraph->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            const auto * src = node->src[j];
            if (src == nullptr) {
                // An empty slot does not imply that the following slots are empty.
                continue;
            }
            if (name == src->name) {
                return src;
            }
        }
    }
    return nullptr;
}
// Returns a map in which every name from m_model_params.kv_names is mapped to itself.
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
    std::map<std::string, std::string> result;
    const auto & kv_names = m_model_params.kv_names;
    for (auto it = kv_names.begin(); it != kv_names.end(); ++it) {
        result[*it] = *it;
    }
    return result;
}
// Builds the name -> OpenVINO node map for every weight referenced by the graph.
// A source counts as a weight when it is not a view and either lives in a
// WEIGHTS-usage buffer or has a quantized type. Rope frequency weights are
// stored under the fixed key "rope_freqs.weight". Each name is created once.
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
    std::map<std::string, std::shared_ptr<ov::Node>> weights_by_name;

    const int node_count = cgraph->n_nodes;
    for (int node_idx = 0; node_idx < node_count; ++node_idx) {
        ggml_tensor * consumer = cgraph->nodes[node_idx];

        for (int slot = 0; slot < GGML_MAX_SRC; ++slot) {
            ggml_tensor * src = consumer->src[slot];
            if (src == nullptr) {
                continue;
            }

            const std::string key =
                is_rope_freqs_weight(src, consumer) ? std::string("rope_freqs.weight") : std::string(src->name);

            // Views never own weight data.
            if (src->view_src) {
                continue;
            }

            const bool weight_like =
                src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type);
            if (!weight_like) {
                continue;
            }
            if (weights_by_name.find(key) != weights_by_name.end()) {
                continue;
            }

            auto weight = create_weight_node(src, naive);
            weight->set_friendly_name(key);
            weights_by_name[key] = weight;
        }
    }
    return weights_by_name;
}
// Returns the OpenVINO node representing a weight tensor, creating it if needed.
// A node already cached in the tensor's `extra` is reused. Otherwise the tensor is
// wrapped or converted, and for tensors in an OpenVINO buffer the result is
// registered as the tensor's extra.
// Throws std::runtime_error for tensor types outside the supported set.
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
// Fast path: reuse the node stored in the tensor's extra, if any.
if (tensor->extra) {
OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a cpu backend repacked quantized weights");
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
if (weight_extra->weight_node) {
return weight_extra->weight_node;
}
} else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
if (quant_extra->weight_node) {
return quant_extra->weight_node;
}
}
}
// Non-quantized tensors with ne[2] > 1 or ne[3] > 1 become a Constant built from a
// tensor that wraps tensor->data; this path registers no extra.
if (!ggml_is_quantized(tensor->type) && (tensor->ne[2] > 1 || tensor->ne[3] > 1)) {
auto weight_tensor = ov::Tensor(get_ov_type(tensor), get_shape(tensor), tensor->data);
auto weight_node = std::make_shared<ov::op::v0::Constant>(weight_tensor);
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
// Types accepted by the conversion below.
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1, GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
if (weight_types.find(tensor->type) == weight_types.end()) {
throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
ggml_type_name(tensor->type));
}
OvWeight ov_weight;
if (ggml_is_quantized(tensor->type)) {
auto use_bias = naive;
if (is_ov_buffer) {
// Copy the raw bytes to a temporary first: tensor->data is passed both as the
// (copied) input and as the third argument of process_weight_tensor.
// NOTE(review): presumably the third argument is an output buffer that gets
// overwritten in place - confirm against process_weight_tensor.
size_t raw_size = ggml_nbytes(tensor);
std::vector<uint8_t> tmp(raw_size);
memcpy(tmp.data(), tensor->data, raw_size);
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
} else {
ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
}
} else {
ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
}
ov_weight.weight_node->set_friendly_name(tensor->name);
// Only tensors in an OpenVINO buffer get the result cached as an extra.
if (!is_ov_buffer) {
return ov_weight.weight_node;
}
// NOTE(review): the raw `new` result is handed to ggml_openvino_buffer_register_extra;
// presumably that call takes ownership - confirm.
ggml_openvino_extra_base * extra;
if (ov_weight.is_quantized()) {
extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
std::move(ov_weight.zp), ov_weight.weight_node);
} else {
extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
}
ggml_openvino_buffer_register_extra(tensor, extra);
return ov_weight.weight_node;
}
// Writes a human-readable table of the compute graph to the file `filename`:
// one row per node (shape, op, name, strides, buffer name), one indented row per
// non-null source of that node, followed by the graph's leafs.
// Prints "Failed to open file" to std::cerr and returns when the file cannot be opened.
void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
std::ofstream file(filename);
if (!file.is_open()) {
std::cerr << "Failed to open file" << std::endl;
return;
}
file << "=== GRAPH ===\n";
file << "n_nodes = " << cgraph->n_nodes << "\n";
// Column header.
file << " " << std::setw(3) << "nodes"
<< std::setw(15) << "shape"
<< std::setw(20) << "op"
<< std::setw(20) << "name"
<< std::setw(3) << " "
<< std::setw(62) << "stride"
<< std::setw(20) << "buffer_type"
<< "\n";
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
// For views, report the buffer of the viewed tensor.
const char * buf_name = "none";
ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer;
if (buf) {
buf_name = ggml_backend_buffer_name(buf);
}
file << " - " << std::setw(3) << i << ": [ "
<< std::setw(5) << node->ne[0] << ", "
<< std::setw(5) << node->ne[1] << ", "
<< std::setw(5) << node->ne[2] << ", "
<< std::setw(5) << node->ne[3] << "] "
<< std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " "
<< std::left << std::setw(45) << node->name << std::right
<< std::setw(2) << "[ "
<< std::setw(0) << node->nb[0] << ", "
<< std::setw(5) << node->nb[1] << ", "
<< std::setw(5) << node->nb[2] << ", "
<< std::setw(5) << node->nb[3] << "] "
<< std::right << std::setw(15) << buf_name << std::right
<< "\n";
// One row per non-null source; this `i` shadows the node index and is printed as the source slot.
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (auto* src = node->src[i]) {
const char * src_buf_name = "none";
ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer;
if (src_buf) {
src_buf_name = ggml_backend_buffer_name(src_buf);
}
file << std::setw(10) << " [ "
<< std::setw(5) << src->ne[0] << ", "
<< std::setw(5) << src->ne[1] << ", "
<< std::setw(5) << src->ne[2] << ", "
<< std::setw(5) << src->ne[3] << "] "
<< std::setw(12)
<< i << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right;
file << std::left << std::setw(30) << src->name << std::right
<< std::setw(16) << "[ "
<< std::setw(0) << src->nb[0] << ", "
<< std::setw(5) << src->nb[1] << ", "
<< std::setw(5) << src->nb[2] << ", "
<< std::setw(5) << src->nb[3] << "] "
<< std::right << std::setw(15) << src_buf_name << std::right
<< "\n";
}
}
}
// Leafs: only the first two dimensions are printed.
file << "n_leafs = " << cgraph->n_leafs << "\n";
for (int i = 0; i < cgraph->n_leafs; i++) {
ggml_tensor * node = cgraph->leafs[i];
const char * leaf_buf_name = "none";
ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer;
if (leaf_buf) {
leaf_buf_name = ggml_backend_buffer_name(leaf_buf);
}
file << " - " << std::setw(3) << i << ": [ "
<< std::setw(5) << node->ne[0] << ", "
<< std::setw(5) << node->ne[1] << "] "
<< std::setw(8) << ggml_op_name(node->op) << " "
<< std::setw(16) << ggml_get_name(node)
<< std::setw(20) << leaf_buf_name << "\n";
}
file << "========================================\n";
file.close();
}
void print_tensor_address_map(const ggml_cgraph * cgraph) {
std::map<void *, std::vector<std::string>> address_map;
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto * node = cgraph->nodes[node_n];
if (node->data) {
auto it = address_map.find(node->data);
if (it == address_map.end()) {
address_map[node->data] = std::vector<std::string>();
}
address_map[node->data].push_back(node->name);
}
}
for (const auto & pair : address_map) {
std::cout << "Address: " << pair.first << std::endl;
for (const auto & name : pair.second) {
std::cout << name << " ; ";
}
std::cout << std::endl << std::endl;
}
}
// Converts a ggml tensor's extents into a shape with the dimension order
// reversed: ne[GGML_MAX_DIMS - 1] comes first, ne[0] last.
ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
    std::vector<size_t> dims(GGML_MAX_DIMS);
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        dims[GGML_MAX_DIMS - 1 - d] = static_cast<size_t>(tensor->ne[d]);
    }
    return dims;
}
// Returns the tensor's nb[] stride values with the dimension order reversed:
// nb[GGML_MAX_DIMS - 1] comes first, nb[0] last.
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
    std::vector<size_t> strides(GGML_MAX_DIMS);
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        strides[GGML_MAX_DIMS - 1 - d] = static_cast<size_t>(tensor->nb[d]);
    }
    return strides;
}
// Maps a ggml tensor's element type to the matching OpenVINO element type.
// Types without a mapping here (including all quantized types) yield
// ov::element::dynamic.
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
    ov::element::Type mapped = ov::element::dynamic;
    switch (tensor->type) {
        // Floating point
        case GGML_TYPE_F64:  mapped = ov::element::f64;  break;
        case GGML_TYPE_F32:  mapped = ov::element::f32;  break;
        case GGML_TYPE_F16:  mapped = ov::element::f16;  break;
        case GGML_TYPE_BF16: mapped = ov::element::bf16; break;
        // Signed integers
        case GGML_TYPE_I8:   mapped = ov::element::i8;   break;
        case GGML_TYPE_I16:  mapped = ov::element::i16;  break;
        case GGML_TYPE_I32:  mapped = ov::element::i32;  break;
        case GGML_TYPE_I64:  mapped = ov::element::i64;  break;
        default:
            break;
    }
    return mapped;
}
// Shape of the input called `name` of node `node_idx`.
// map::at throws std::out_of_range when the node has no input with that name.
ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const {
    const auto & inputs = m_node_info_list[node_idx].node_inputs;
    const ggml_tensor * input = inputs.at(name);
    return ov::PartialShape(get_shape(input));
}
std::vector<size_t> GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const {
return get_stride(m_node_info_list[node_idx].node_inputs.at(name));
}
// Length of the view chain recorded for input `name` of node `node_idx`;
// 0 when no chain was recorded for that name.
size_t GgmlOvDecoder::get_view_input_size(int node_idx, const std::string & name) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end()) {
        return 0;
    }
    return entry->second.size();
}
// view_offs of the `view_index`-th tensor in the view chain of input `name`;
// 0 when the chain does not exist or the index is out of range.
size_t GgmlOvDecoder::get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return 0;
    }
    return entry->second[view_index].second->view_offs;
}
// view_offs of src[0] of the `view_index`-th tensor in the view chain of input
// `name`; 0 when the chain, the entry or its src[0] does not exist.
size_t GgmlOvDecoder::get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return 0;
    }
    auto * view = entry->second[view_index].second;
    if (view == nullptr || view->src[0] == nullptr) {
        return 0;
    }
    return view->src[0]->view_offs;
}
// Strides of the `view_index`-th tensor in the view chain of input `name`;
// an empty vector when the chain does not exist or the index is out of range.
std::vector<size_t> GgmlOvDecoder::get_view_input_stride(int node_idx,
                                                         const std::string & name,
                                                         size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }
    return get_stride(entry->second[view_index].second);
}
// Strides of src[0] of the `view_index`-th tensor in the view chain of input
// `name`; an empty vector when the chain, the entry or its src[0] does not exist.
std::vector<size_t> GgmlOvDecoder::get_view_input_src_stride(int node_idx,
                                                             const std::string & name,
                                                             size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }
    auto * view = entry->second[view_index].second;
    if (view == nullptr || view->src[0] == nullptr) {
        return {};
    }
    return get_stride(view->src[0]);
}
// Shape (via get_shape) of the `view_index`-th tensor in the view chain of input
// `name`; an empty shape when the chain does not exist or the index is out of range.
ov::Shape GgmlOvDecoder::get_view_input_ggml_shape(int node_idx, const std::string & name, size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }
    return get_shape(entry->second[view_index].second);
}
// Shape (via get_shape) of src[0] of the `view_index`-th tensor in the view chain
// of input `name`; an empty shape when the chain, the entry or its src[0] does not exist.
ov::Shape GgmlOvDecoder::get_view_input_src_ggml_shape(int node_idx,
                                                       const std::string & name,
                                                       size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }
    auto * view = entry->second[view_index].second;
    if (view == nullptr || view->src[0] == nullptr) {
        return {};
    }
    return get_shape(view->src[0]);
}
// Partial shape of the `view_index`-th tensor in the view chain of input `name`.
// When m_node_dynamic_dims records a dynamic dimension (value != -1) for that
// tensor, the corresponding OpenVINO dimension becomes get_static_n_tokens() in
// static mode and dynamic (-1) otherwise.
// Returns a default-constructed shape when the chain or index does not exist.
ov::PartialShape GgmlOvDecoder::get_view_input_ov_shape(int node_idx,
                                                        const std::string & name,
                                                        size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }

    auto * tensor = entry->second[view_index].second;
    ov::PartialShape shape = ov::PartialShape{get_shape(tensor)};

    const auto dynamic_entry = m_node_dynamic_dims.find(tensor);
    const bool has_dynamic_dim = dynamic_entry != m_node_dynamic_dims.end() && dynamic_entry->second != -1;
    if (has_dynamic_dim) {
        // ggml dimension d corresponds to OpenVINO dimension 3 - d.
        int dynamic_dim_index = dynamic_entry->second;
        shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1;
    }
    return shape;
}
// Partial shape of src[0] of the `view_index`-th tensor in the view chain of
// input `name`. When m_node_dynamic_dims records a dynamic dimension
// (value != -1) for that source, the corresponding OpenVINO dimension becomes
// get_static_n_tokens() in static mode and dynamic (-1) otherwise.
// Returns a default-constructed shape when the chain, entry or src[0] does not exist.
ov::PartialShape GgmlOvDecoder::get_view_input_src_ov_shape(int node_idx,
                                                            const std::string & name,
                                                            size_t view_index) const {
    const auto & views = m_node_info_list[node_idx].node_inputs_views;
    const auto entry = views.find(name);
    if (entry == views.end() || view_index >= entry->second.size()) {
        return {};
    }

    auto * view = entry->second[view_index].second;
    if (!(view && view->src[0])) {
        return {};
    }

    auto * parent = view->src[0];
    ov::PartialShape shape = ov::PartialShape{get_shape(parent)};

    const auto dynamic_entry = m_node_dynamic_dims.find(parent);
    const bool has_dynamic_dim = dynamic_entry != m_node_dynamic_dims.end() && dynamic_entry->second != -1;
    if (has_dynamic_dim) {
        // ggml dimension d corresponds to OpenVINO dimension 3 - d.
        int dynamic_dim_index = dynamic_entry->second;
        shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1;
    }
    return shape;
}
// Returns the name of the `view_index`-th view tensor recorded for input `name`
// of node `node_idx`. Yields an empty string when the input has no recorded views,
// `view_index` is out of range, or the stored tensor pointer is null (the same
// null guard get_view_input_src_name() applies).
std::string GgmlOvDecoder::get_view_input_name(int node_idx, const std::string & name, size_t view_index) const {
    const auto & views_by_input = m_node_info_list[node_idx].node_inputs_views;
    const auto   it             = views_by_input.find(name);
    if (it == views_by_input.end() || view_index >= it->second.size()) {
        return "";
    }
    const auto * view_tensor = it->second[view_index].second;
    if (view_tensor == nullptr) {
        // Avoid dereferencing a missing view tensor.
        return "";
    }
    return view_tensor->name;
}
// Returns the name of src[0] of the `view_index`-th view tensor recorded for input
// `name` of node `node_idx`. Yields an empty string when the lookup fails, the index
// is out of range, or the view tensor / its src[0] is null.
std::string GgmlOvDecoder::get_view_input_src_name(int node_idx, const std::string & name, size_t view_index) const {
    const auto & views_by_input = m_node_info_list[node_idx].node_inputs_views;
    const auto   entry          = views_by_input.find(name);
    if (entry == views_by_input.end() || view_index >= entry->second.size()) {
        return "";
    }
    auto * view = entry->second[view_index].second;
    if (!view || !view->src[0]) {
        return "";
    }
    return view->src[0]->name;
}
// Returns the OpenVINO element type of input `name` of node `node_idx`.
// The lookup uses map::at(), so an unknown input name throws.
ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const {
    const auto & inputs = m_node_info_list[node_idx].node_inputs;
    return get_ov_type(inputs.at(name));
}
size_t GgmlOvDecoder::get_input_size() const {
return m_model_inputs.size();
}
// Returns the number of input names recorded for node `node_idx`.
size_t GgmlOvDecoder::get_input_size(int node_idx) const {
    const auto & input_names = m_node_info_list[node_idx].node_inputs_names;
    return input_names.size();
}
// Returns a copy of the input names recorded for node `node_idx`.
std::vector<std::string> GgmlOvDecoder::get_input_names(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return info.node_inputs_names;
}
// Returns the shape of the output tensor of node `node_idx`, wrapped as an
// ov::PartialShape. No dynamic-dim substitution is applied here.
ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return ov::PartialShape(get_shape(info.node_output));
}
// Returns the OpenVINO element type derived from the tensor of node `node_idx`.
ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return get_ov_type(info.node);
}
// Returns the strides of the tensor of node `node_idx`, as computed by get_stride().
std::vector<size_t> GgmlOvDecoder::get_output_stride(int node_idx) const {
    return get_stride(m_node_info_list[node_idx].node);
}
// Returns a one-element vector holding the output name of node `node_idx`.
std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
    std::vector<std::string> names;
    names.push_back(m_node_info_list[node_idx].node_output_name);
    return names;
}
// Index-less overload: always returns the placeholder "UNKNOWN_OP_NAME".
// The string has static storage, so the returned reference stays valid.
const std::string & GgmlOvDecoder::get_op_name() const {
    static const std::string placeholder_name("UNKNOWN_OP_NAME");
    return placeholder_name;
}
// Returns the dynamic dimension index recorded for the tensor of node `node_idx`,
// or -1 when m_node_dynamic_dims has no entry for it.
int32_t GgmlOvDecoder::get_op_dynamic_dim(int node_idx) const {
    const auto found = m_node_dynamic_dims.find(m_node_info_list[node_idx].node);
    if (found != m_node_dynamic_dims.end()) {
        return found->second;
    }
    return -1;
}
// Returns a reference to the stored name of node `node_idx`.
const std::string & GgmlOvDecoder::get_op_name(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return info.node_name;
}
// Returns a pointer to the op_params array of input tensor `name` of node
// `node_idx`. The lookup uses map::at(), so an unknown input name throws.
int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const {
    auto * input_tensor = m_node_info_list[node_idx].node_inputs.at(name);
    return input_tensor->op_params;
}
// Returns a pointer to the op_params array of the tensor of node `node_idx`.
int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const {
    auto * node_tensor = m_node_info_list[node_idx].node;
    return node_tensor->op_params;
}
// Returns the view_offs field of the tensor of node `node_idx`.
size_t GgmlOvDecoder::get_output_op_offset(int node_idx) const {
    auto * node_tensor = m_node_info_list[node_idx].node;
    return node_tensor->view_offs;
}
// Invokes `node_visitor` once for every cgraph node whose op is not GGML_OP_NONE,
// in graph order. Each call receives a freshly made shared copy of this decoder
// together with the node's index.
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const {
    for (int idx = 0; idx < m_cgraph->n_nodes; ++idx) {
        const bool is_real_op = m_cgraph->nodes[idx]->op != GGML_OP_NONE;
        if (is_real_op) {
            node_visitor(std::make_shared<GgmlOvDecoder>(*this), idx);
        }
    }
}
// Maps a ggml tensor's operation to the string identifier used as its op type.
//
// GGML_OP_UNARY and GGML_OP_GLU nodes are resolved through their sub-op
// (ggml_get_unary_op / ggml_get_glu_op); every other node is resolved through
// node->op. Operations that are not present in the corresponding table yield
// "UNKNOWN_GGML_OP" instead of throwing std::out_of_range, which is what the
// previously unreachable fallback after the switch was meant to provide.
std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
    static const std::map<ggml_op, std::string> ops = {
        {GGML_OP_NONE,            "GGML_OP_NONE"           },
        {GGML_OP_ACC,             "GGML_OP_ACC"            },
        {GGML_OP_ADD,             "GGML_OP_ADD"            },
        {GGML_OP_ADD1,            "GGML_OP_ADD1"           },
        {GGML_OP_ADD_ID,          "GGML_OP_ADD_ID"         },
        {GGML_OP_CONCAT,          "GGML_OP_CONCAT"         },
        {GGML_OP_CONT,            "GGML_OP_CONT"           },
        {GGML_OP_DIV,             "GGML_OP_DIV"            },
        {GGML_OP_DUP,             "GGML_OP_DUP"            },
        {GGML_OP_GET_ROWS,        "GGML_OP_GET_ROWS"       },
        {GGML_OP_MUL,             "GGML_OP_MUL"            },
        {GGML_OP_MUL_MAT,         "GGML_OP_MUL_MAT"        },
        {GGML_OP_MUL_MAT_ID,      "GGML_OP_MUL_MAT_ID"     },
        {GGML_OP_PERMUTE,         "GGML_OP_PERMUTE"        },
        {GGML_OP_RESHAPE,         "GGML_OP_RESHAPE"        },
        {GGML_OP_RMS_NORM,        "GGML_OP_RMS_NORM"       },
        {GGML_OP_NORM,            "GGML_OP_NORM"           },
        {GGML_OP_ROPE,            "GGML_OP_ROPE"           },
        {GGML_OP_SCALE,           "GGML_OP_SCALE"          },
        {GGML_OP_SOFT_MAX,        "GGML_OP_SOFT_MAX"       },
        {GGML_OP_SUM_ROWS,        "GGML_OP_SUM_ROWS"       },
        {GGML_OP_SUB,             "GGML_OP_SUB"            },
        {GGML_OP_TRANSPOSE,       "GGML_OP_TRANSPOSE"      },
        {GGML_OP_VIEW,            "GGML_OP_VIEW"           },
        {GGML_OP_SET_ROWS,        "GGML_OP_SET_ROWS"       },
        {GGML_OP_CPY,             "GGML_OP_CPY"            },
        {GGML_OP_FLASH_ATTN_EXT,  "GGML_OP_FLASH_ATTN_EXT" },
        {GGML_OP_L2_NORM,         "GGML_OP_L2_NORM"        },
        {GGML_OP_CLAMP,           "GGML_OP_CLAMP"          },
        {GGML_OP_PAD,             "GGML_OP_PAD"            },
        {GGML_OP_SSM_CONV,        "GGML_OP_SSM_CONV"       },
        {GGML_OP_GATED_DELTA_NET, "GGML_OP_GATED_DELTA_NET"},
        {GGML_OP_ARGSORT,         "GGML_OP_ARGSORT"        },
        {GGML_OP_REPEAT,          "GGML_OP_REPEAT"         },
        {GGML_OP_IM2COL,          "GGML_OP_IM2COL"         }
    };
    static const std::map<ggml_unary_op, std::string> unary_ops = {
        {GGML_UNARY_OP_ABS,         "GGML_UNARY_OP_ABS"        },
        {GGML_UNARY_OP_SGN,         "GGML_UNARY_OP_SGN"        },
        {GGML_UNARY_OP_NEG,         "GGML_UNARY_OP_NEG"        },
        {GGML_UNARY_OP_STEP,        "GGML_UNARY_OP_STEP"       },
        {GGML_UNARY_OP_TANH,        "GGML_UNARY_OP_TANH"       },
        {GGML_UNARY_OP_ELU,         "GGML_UNARY_OP_ELU"        },
        {GGML_UNARY_OP_RELU,        "GGML_UNARY_OP_RELU"       },
        {GGML_UNARY_OP_SIGMOID,     "GGML_UNARY_OP_SIGMOID"    },
        {GGML_UNARY_OP_GELU,        "GGML_UNARY_OP_GELU"       },
        {GGML_UNARY_OP_GELU_QUICK,  "GGML_UNARY_OP_GELU_QUICK" },
        {GGML_UNARY_OP_SILU,        "GGML_UNARY_OP_SILU"       },
        {GGML_UNARY_OP_SOFTPLUS,    "GGML_UNARY_OP_SOFTPLUS"   },
        {GGML_UNARY_OP_HARDSWISH,   "GGML_UNARY_OP_HARDSWISH"  },
        {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
        {GGML_UNARY_OP_EXP,         "GGML_UNARY_OP_EXP"        },
        {GGML_UNARY_OP_COUNT,       "GGML_UNARY_OP_COUNT"      }
    };
    static const std::map<ggml_glu_op, std::string> glu_ops = {
        {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
        {GGML_GLU_OP_GEGLU,  "GGML_GLU_OP_GEGLU" },
        {GGML_GLU_OP_REGLU,  "GGML_GLU_OP_REGLU" }
    };
    static const std::string unknown_op = "UNKNOWN_GGML_OP";

    // Non-throwing table lookup: falls back to unknown_op for keys that have no entry.
    const auto name_or_unknown = [](const auto & table, const auto key) -> const std::string & {
        const auto it = table.find(key);
        return it != table.end() ? it->second : unknown_op;
    };

    switch (node->op) {
        case GGML_OP_UNARY:
            return name_or_unknown(unary_ops, ggml_get_unary_op(node));
        case GGML_OP_GLU:
            return name_or_unknown(glu_ops, ggml_get_glu_op(node));
        default:
            return name_or_unknown(ops, node->op);
    }
}
// Returns a reference to the op-type string stored for node `node_idx`.
const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return info.node_op_type;
}
// Index-less overload: always returns the placeholder "UNKNOWN_GGML_OP".
// The string has static storage, so the returned reference stays valid.
const std::string & GgmlOvDecoder::get_op_type() const {
    static const std::string placeholder_type("UNKNOWN_GGML_OP");
    return placeholder_type;
}
// Fills m_node_dynamic_dims: for each tensor reachable from the cgraph nodes it
// records the ggml dimension index (0..GGML_MAX_DIMS-1) treated as dynamic, or -1
// when the tensor has none.
//
// Each node is resolved depth-first: its sources are visited first, then the
// node's own entry is derived from its sources by a per-op rule. Entries already
// present in m_node_dynamic_dims act as the memoization / visited marker.
void GgmlOvDecoder::compute_node_dynamic_dims() {
    auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
        if (!node) {
            return;
        }

        // CPY nodes are pinned to "no dynamic dim". Because the entry now exists,
        // the memoization check below returns before their sources are visited.
        if (node->op == GGML_OP_CPY) {
            m_node_dynamic_dims[node] = -1;
        }

        if (m_node_dynamic_dims.count(node)) {
            return;
        }

        // Resolve the sources first.
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            ggml_tensor * src = node->src[i];
            if (src == nullptr) {
                continue;
            }
            // Sources recognised as token / position / output-index inputs are seeded with dim 0.
            if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) {
                m_node_dynamic_dims[src] = 0;
                continue;
            }
            // A leaf tensor consumed through a VIEW is seeded with dim 1, unless the
            // decoder is stateful or the model is split.
            if (node->op == GGML_OP_VIEW && src->op == GGML_OP_NONE && !is_stateful() && !m_model_is_splitted) {
                m_node_dynamic_dims[src] = 1;
                continue;
            }
            self(self, src);
        }

        // Derive this node's entry from its sources.
        switch (node->op) {
            case GGML_OP_NONE:
                m_node_dynamic_dims[node] = -1;
                break;
            case GGML_OP_GET_ROWS:
                // Driven by src[1]: dim 0 of src[1] becomes dim 1 of the result; any other
                // dim is located by matching its stride (rescaled to src[0]'s element size)
                // against src[0]'s strides.
                m_node_dynamic_dims[node] = -1;
                if (m_node_dynamic_dims[node->src[1]] != -1) {
                    auto dynamic_dim_idx = m_node_dynamic_dims[node->src[1]];
                    if (dynamic_dim_idx == 0) {
                        m_node_dynamic_dims[node] = 1;
                    } else {
                        auto dynamic_dim_stride = node->src[1]->nb[dynamic_dim_idx] / ggml_type_size(node->src[1]->type) *
                                                  ggml_type_size(node->src[0]->type);
                        for (int i = 0; i < GGML_MAX_DIMS; i++) {
                            if (dynamic_dim_stride == node->src[0]->nb[i]) {
                                m_node_dynamic_dims[node] = i;
                                break;
                            }
                        }
                    }
                }
                break;
            case GGML_OP_MUL:
            case GGML_OP_MUL_MAT:
                // Inherit from src[0]; src[1] takes precedence when it also has a dynamic dim.
                m_node_dynamic_dims[node] = -1;
                if (m_node_dynamic_dims[node->src[0]] != -1) {
                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
                }
                if (m_node_dynamic_dims[node->src[1]] != -1) {
                    m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
                }
                break;
            case GGML_OP_PERMUTE:
                // Result dim is the first index i whose op_params[i] equals the source's dynamic dim.
                m_node_dynamic_dims[node] = -1;
                if (m_node_dynamic_dims[node->src[0]] != -1) {
                    auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
                        if (node->op_params[i] == dynamic_dim_idx) {
                            m_node_dynamic_dims[node] = i;
                            break;
                        }
                    }
                }
                break;
            case GGML_OP_VIEW: {
                m_node_dynamic_dims[node] = -1;
                if (m_node_dynamic_dims[node->src[0]] != -1) {
                    // A view of a leaf tensor keeps the leaf's dynamic dim unchanged.
                    if (node->src[0]->op == GGML_OP_NONE) {
                        m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
                        break;
                    }
                    // Otherwise find the view dim with the same stride (rescaled to the
                    // view's element size) and keep it only if the extent is unchanged.
                    auto dynamic_dim_idx   = m_node_dynamic_dims[node->src[0]];
                    auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
                    auto dynamic_dim_stride =
                        node->src[0]->nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) * ggml_type_size(node->type);
                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
                        if (node->nb[i] == dynamic_dim_stride) {
                            m_node_dynamic_dims[node] = i;
                            break;
                        }
                    }
                    if (m_node_dynamic_dims[node] != -1 && dynamic_dim_value != node->ne[m_node_dynamic_dims[node]]) {
                        m_node_dynamic_dims[node] = -1;
                    }
                }
                break;
            }
            case GGML_OP_TRANSPOSE:
            case GGML_OP_RESHAPE: {
                // Match on both stride and extent of the source's dynamic dim; stays -1 if
                // no result dim matches.
                m_node_dynamic_dims[node] = -1;
                if (m_node_dynamic_dims[node->src[0]] != -1) {
                    auto dynamic_dim_idx    = m_node_dynamic_dims[node->src[0]];
                    auto dynamic_dim_stride = node->src[0]->nb[dynamic_dim_idx];
                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
                        if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
                            m_node_dynamic_dims[node] = i;
                            break;
                        }
                    }
                }
                break;
            }
            case GGML_OP_FLASH_ATTN_EXT: {
                // Fixed mapping from src[0]'s dynamic dim to the output's: dims 1 and 2 swap.
                constexpr int q_to_out[GGML_MAX_DIMS] = {0, 2, 1, 3};
                m_node_dynamic_dims[node]             = -1;
                if (m_node_dynamic_dims[node->src[0]] != -1) {
                    auto q_dynamic_dim        = m_node_dynamic_dims[node->src[0]];
                    m_node_dynamic_dims[node] = q_to_out[q_dynamic_dim];
                }
                break;
            }
            case GGML_OP_CONT:
                m_node_dynamic_dims[node] = -1;
                if (m_node_dynamic_dims[node->src[0]] != -1) {
                    auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
                    if (ggml_are_same_shape(node, node->src[0])) {
                        m_node_dynamic_dims[node] = dynamic_dim_idx;
                    } else {
                        // Shapes differ: rebuild the strides the source would have from its
                        // extents alone, then look for the result dim with the same stride
                        // and extent. The match must be unique.
                        size_t src_logical_nb[GGML_MAX_DIMS];
                        src_logical_nb[0] = ggml_type_size(node->src[0]->type);
                        src_logical_nb[1] = src_logical_nb[0] * (node->src[0]->ne[0] / ggml_blck_size(node->src[0]->type));
                        for (int i = 2; i < GGML_MAX_DIMS; i++) {
                            src_logical_nb[i] = src_logical_nb[i - 1] * node->src[0]->ne[i - 1];
                        }
                        auto dynamic_dim_stride = src_logical_nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) *
                                                  ggml_type_size(node->type);
                        int matched_dim_count = 0;
                        for (int i = 0; i < GGML_MAX_DIMS; i++) {
                            if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
                                m_node_dynamic_dims[node] = i;
                                matched_dim_count++;
                            }
                        }
                        if (matched_dim_count != 1) {
                            m_node_dynamic_dims[node] = -1;
                        }
                    }
                }
                break;
            case GGML_OP_RMS_NORM:
            case GGML_OP_NORM:
            case GGML_OP_ADD:
            case GGML_OP_GLU:
            case GGML_OP_ROPE:
            case GGML_OP_SCALE:
            case GGML_OP_SOFT_MAX:
            case GGML_OP_ARGSORT:
            case GGML_OP_ADD_ID:
            case GGML_OP_UNARY:
                // These ops take over src[0]'s entry as-is.
                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
                break;
            case GGML_OP_MUL_MAT_ID:
                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
                break;
            case GGML_OP_CPY:
            case GGML_OP_SET_ROWS:
                m_node_dynamic_dims[node] = -1;
                break;
            case GGML_OP_IM2COL: {
                // Driven by src[1]; op_params[6] == 1 selects the 2D mapping.
                m_node_dynamic_dims[node] = -1;
                if (m_node_dynamic_dims[node->src[1]] != -1) {
                    const bool is_2D   = node->op_params[6] == 1;
                    const int  src_dyn = m_node_dynamic_dims[node->src[1]];
                    if (is_2D) {
                        if (src_dyn == 0) {
                            m_node_dynamic_dims[node] = 1;
                        } else if (src_dyn == 1) {
                            m_node_dynamic_dims[node] = 2;
                        } else if (src_dyn == 3) {
                            m_node_dynamic_dims[node] = 3;
                        }
                    } else {
                        if (src_dyn == 0) {
                            m_node_dynamic_dims[node] = 1;
                        } else if (src_dyn == 2) {
                            m_node_dynamic_dims[node] = 2;
                        }
                    }
                    if (m_node_dynamic_dims[node] != -1) {
                        OPENVINO_ASSERT(node->src[1]->ne[src_dyn] == node->ne[m_node_dynamic_dims[node]],
                                        "Dynamic dim value mismatch for IM2COL node: " + std::string(node->name) +
                                            " and its src[1]: " + std::string(node->src[1]->name));
                    }
                }
                break;
            }
            default:
                // Ops not listed above get no entry here.
                // NOTE(review): a later `m_node_dynamic_dims[tensor]` read on such a tensor
                // creates the entry via operator[]; if the container is std::map-like that
                // value is presumably 0 rather than -1 - confirm this is intended.
                break;
        }
    };

    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        ggml_tensor * node = m_cgraph->nodes[i];
        visit_node(visit_node, node);
    }

    // Debug dump of the computed dynamic dims (marked with '*'); disabled, change the
    // condition to enable it.
    if (0) {
        for (int i = 0; i < m_cgraph->n_nodes; i++) {
            ggml_tensor * node        = m_cgraph->nodes[i];
            int           dynamic_dim = m_node_dynamic_dims[node];
            std::cout << "[" << i << "] " << "node_name: " << node->name << " op: " << ggml_op_name(node->op)
                      << " shape: [";
            for (int j = 0; j < 4; j++) {
                if (j == dynamic_dim) {
                    std::cout << "*";
                } else {
                    std::cout << node->ne[j];
                }
                if (j < 3) {
                    std::cout << ", ";
                }
            }
            std::cout << "]" << std::endl;
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                ggml_tensor * src = node->src[j];
                if (src == nullptr) {
                    continue;
                }
                int src_dynamic_dim = m_node_dynamic_dims[src];
                std::cout << "    [" << j << "] src_name: " << src->name << " [";
                for (int k = 0; k < 4; k++) {
                    if (k == src_dynamic_dim) {
                        std::cout << "*";
                    } else {
                        std::cout << src->ne[k];
                    }
                    if (k < 3) {
                        std::cout << ", ";
                    }
                }
                std::cout << "]" << std::endl;
            }
            std::cout << std::endl;
        }
    }
}