#include "models.h"
// Builds the compute graph for clip_graph_deepseekocr2.
//
// Pipeline, as wired below:
//   build_inp_raw() -> build_sam() -> flatten/transpose into a token matrix
//   -> concatenate learned query embeddings after the image tokens
//   -> build_vit() with RMS norm, SiLU FFN, RoPE and an external attention mask
//   -> keep only the trailing `num_queries` rows
//   -> linear projection (mm_fc_w, mm_fc_b) -> optional view separator.
//
// Returns `gf` with the final tensor (tagged "dsocr2_output") expanded into it.
ggml_cgraph * clip_graph_deepseekocr2::build() {
    // n_head_kv must be positive and must evenly divide n_head.
    GGML_ASSERT(hparams.n_head_kv > 0);
    GGML_ASSERT(n_head % hparams.n_head_kv == 0);

    ggml_tensor * sam_out = build_sam(build_inp_raw());

    // Collapse the first two dims of the SAM output, then swap the two remaining
    // axes and make the result contiguous. Afterwards:
    //   tokens->ne[0] == sam_out->ne[2]
    //   tokens->ne[1] == sam_out->ne[0] * sam_out->ne[1]
    ggml_tensor * tokens = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]);
    tokens = ggml_permute(ctx0, tokens, 1, 0, 2, 3);
    tokens = ggml_cont(ctx0, tokens);

    // Exactly two token counts are accepted; each selects its own query table.
    const int64_t n_img_tokens = tokens->ne[1];
    GGML_ASSERT(n_img_tokens == 144 || n_img_tokens == 256);

    const bool    is_144      = (n_img_tokens == 144);
    ggml_tensor * query_embed = is_144 ? model.resample_query_768 : model.resample_query_1024;
    const int     num_queries = is_144 ? 144 : 256;

    // Append the queries (cast to the token tensor's type) after the image tokens on dim 1.
    ggml_tensor * queries = ggml_cast(ctx0, query_embed, tokens->type);
    tokens = ggml_concat(ctx0, tokens, queries, 1);
    const int64_t seq_len = tokens->ne[1];

    // seq_len x seq_len F32 mask, registered as a named graph input. Its values are
    // not written in this function (presumably filled by the caller before compute).
    ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len);
    ggml_set_name(attn_mask, "qwen2_attn_mask");
    ggml_set_input(attn_mask);

    // Position ids 0 .. seq_len-1, converted to I32 for the RoPE op.
    ggml_tensor * positions = ggml_arange(ctx0, 0, seq_len, 1);
    positions = ggml_cast(ctx0, positions, GGML_TYPE_I32);

    // Positional-encoding hook handed to build_vit; the layer argument is unused.
    auto apply_rope = [&](ggml_tensor * x, const clip_layer &) {
        return ggml_rope_ext(ctx0, x, positions, nullptr, d_head,
                             GGML_ROPE_TYPE_NEOX,
                             /*n_ctx_orig =*/ 131072,
                             /*freq_base  =*/ 1000000,
                             /*freq_scale =*/ 1,
                             /*ext_factor =*/ 0,
                             /*attn_factor=*/ 1,
                             /*beta_fast  =*/ 0,
                             /*beta_slow  =*/ 0);
    };

    build_vit_opts vit_opts;
    vit_opts.attn_mask = attn_mask;

    ggml_tensor * hidden = build_vit(tokens, seq_len, NORM_TYPE_RMS, FFN_SILU,
                                     nullptr, apply_rope, vit_opts);

    // Slice out the last `num_queries` rows along dim 1 (the rows that follow the
    // image tokens, assuming the query table has `num_queries` rows) and copy them
    // into a contiguous tensor.
    const size_t row_bytes  = hidden->nb[1];
    const size_t tail_start = row_bytes * (hidden->ne[1] - num_queries);
    ggml_tensor * qwen2_out = ggml_cont(ctx0,
        ggml_view_2d(ctx0, hidden, hidden->ne[0], num_queries, row_bytes, tail_start));

    // Register the resampled rows in the graph before the projection is built.
    ggml_build_forward_expand(gf, qwen2_out);

    // Linear projection: mm_fc_w * x + mm_fc_b.
    ggml_tensor * out = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
    out = ggml_add(ctx0, out, model.mm_fc_b);

    // Optionally append the view separator as an extra entry along dim 1.
    if (img.add_viewsep) {
        out = ggml_concat(ctx0, out, model.view_seperator, 1);
    }

    cb(out, "dsocr2_output", -1);
    ggml_build_forward_expand(gf, out);
    return gf;
}