#ifndef __T5_HPP__
#define __T5_HPP__
#include <float.h>
#include <algorithm>
#include <cmath>
#include <limits>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "darts.h"
#include "ggml_extend.hpp"
#include "json.hpp"
#include "model.h"
class MetaspacePreTokenizer {
private:
std::string replacement;
bool add_prefix_space;
public:
    MetaspacePreTokenizer(const std::string& replacement = " ", bool add_prefix_space = true)
        : replacement(replacement), add_prefix_space(add_prefix_space) {}
std::string tokenize(const std::string& input) const {
std::string tokens;
std::stringstream ss(input);
if (add_prefix_space) {
tokens += replacement;
}
std::string token;
bool firstToken = true;
while (std::getline(ss, token, ' ')) {
if (!firstToken)
tokens += replacement + token;
else
tokens += token;
firstToken = false;
}
return tokens;
}
};
using EncodeResult = std::vector<std::pair<std::string, int>>;
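// Unigram-LM tokenizer for T5/UMT5 in the style of SentencePiece: pieces and
// their log-probability scores come from a tokenizer.json supplied by
// ModelLoader, piece lookup goes through a Darts double-array trie, and
// encoding is a Viterbi search over the resulting piece lattice.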
class T5UniGramTokenizer {
public:
enum Status {
OK,
NO_PIECES_LOADED,
NO_ENTRY_FOUND,
BUILD_DOUBLE_ARRAY_FAILED,
PIECE_ALREADY_DEFINED,
        INVALID_JSON
};
protected:
MetaspacePreTokenizer pre_tokenizer;
std::vector<std::pair<std::string, float>> piece_score_pairs;
float min_score_ = 0.0;
float max_score_ = 0.0;
std::unique_ptr<Darts::DoubleArray> trie_;
int trie_results_size_;
int unk_id_ = 2;
std::string eos_token_ = "</s>";
int eos_id_ = 1;
int pad_id_ = 0;
Status status_ = OK;
float kUnkPenalty = 10.0;
std::string replacement;
bool add_prefix_space = true;
void InitializePieces(const std::string& json_str) {
nlohmann::json data;
try {
data = nlohmann::json::parse(json_str);
} catch (const nlohmann::json::parse_error& e) {
            status_ = INVALID_JSON;
return;
}
if (!data.contains("model")) {
            status_ = INVALID_JSON;
return;
}
nlohmann::json model = data["model"];
if (!model.contains("vocab")) {
            status_ = INVALID_JSON;
return;
}
if (model.contains("unk_id")) {
unk_id_ = model["unk_id"];
}
replacement = data["pre_tokenizer"]["replacement"];
add_prefix_space = data["pre_tokenizer"]["add_prefix_space"];
pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space);
for (const auto& item : model["vocab"]) {
if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) {
                status_ = INVALID_JSON;
return;
}
std::string piece = item[0];
if (piece.empty()) {
piece = "<empty_token>";
}
float score = item[1];
piece_score_pairs.emplace_back(piece, score);
}
}
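    // Builds the Darts double-array trie mapping each piece to its vocab id.
    // Darts requires the keys to be sorted; afterwards a common-prefix search
    // over every piece verifies that the trie actually yields matches.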
void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) {
if (status_ != OK)
return;
if (pieces->empty()) {
status_ = NO_PIECES_LOADED;
return;
}
        std::sort(pieces->begin(), pieces->end());
std::vector<const char*> key(pieces->size());
std::vector<int> value(pieces->size());
        for (size_t i = 0; i < pieces->size(); ++i) {
            key[i]   = (*pieces)[i].first.data();
            value[i] = (*pieces)[i].second;
        }
trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray());
if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr,
&value[0]) != 0) {
status_ = BUILD_DOUBLE_ARRAY_FAILED;
return;
}
const int kMaxTrieResultsSize = 1024;
std::vector<Darts::DoubleArray::result_pair_type> results(
kMaxTrieResultsSize);
trie_results_size_ = 0;
for (const auto& p : *pieces) {
const int num_nodes = trie_->commonPrefixSearch(
p.first.data(), results.data(), results.size(), p.first.size());
trie_results_size_ = std::max(trie_results_size_, num_nodes);
}
if (trie_results_size_ == 0)
status_ = NO_ENTRY_FOUND;
}
inline float GetScoreInlined(int id) const {
return piece_score_pairs[id].second;
}
    inline bool IsUnusedInlined(int id) const {
        return false;
    }
    inline bool IsUserDefinedInlined(int id) const {
        return false;
    }
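    // Returns the byte length (1-4) of the UTF-8 character starting at src,
    // determined by the high nibble of its first byte.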
inline size_t OneCharLen(const char* src) const {
return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
}
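    // Viterbi-style forward pass: for each byte position, extend every trie
    // match starting there and keep the best-scoring path ending at each end
    // position; spans no piece covers fall back to unk_id_ with a penalized
    // score (min_score() - kUnkPenalty). The best segmentation is then
    // recovered by walking starts_at links backwards from the end.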
EncodeResult EncodeOptimized(const std::string& normalized) const {
if (status() != OK || normalized.empty()) {
return {};
}
        struct BestPathNode {
            int id                = -1;
            float best_path_score = 0;
            int starts_at         = -1;
        };
const int size = normalized.size();
const float unk_score = min_score() - kUnkPenalty;
std::vector<BestPathNode> best_path_ends_at(size + 1);
int starts_at = 0;
while (starts_at < size) {
std::size_t node_pos = 0;
std::size_t key_pos = starts_at;
const auto best_path_score_till_here =
best_path_ends_at[starts_at].best_path_score;
bool has_single_node = false;
const int mblen =
std::min<int>(OneCharLen(normalized.data() + starts_at),
size - starts_at);
            while (key_pos < static_cast<std::size_t>(size)) {
const int ret =
trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1);
if (ret == -2)
break;
if (ret >= 0) {
if (IsUnusedInlined(ret))
continue;
auto& target_node = best_path_ends_at[key_pos];
const auto length = (key_pos - starts_at);
const auto score = IsUserDefinedInlined(ret)
? (length * max_score_ - 0.1)
: GetScoreInlined(ret);
const auto candidate_best_path_score =
score + best_path_score_till_here;
if (target_node.starts_at == -1 ||
candidate_best_path_score > target_node.best_path_score) {
target_node.best_path_score = candidate_best_path_score;
target_node.starts_at = starts_at;
target_node.id = ret;
}
if (!has_single_node && length == mblen) {
has_single_node = true;
}
}
}
if (!has_single_node) {
auto& target_node = best_path_ends_at[starts_at + mblen];
const auto candidate_best_path_score =
unk_score + best_path_score_till_here;
if (target_node.starts_at == -1 ||
candidate_best_path_score > target_node.best_path_score) {
target_node.best_path_score = candidate_best_path_score;
target_node.starts_at = starts_at;
target_node.id = unk_id_;
}
}
starts_at += mblen;
}
EncodeResult results;
int ends_at = size;
while (ends_at > 0) {
const auto& node = best_path_ends_at[ends_at];
results.emplace_back(
normalized.substr(node.starts_at, ends_at - node.starts_at), node.id);
ends_at = node.starts_at;
}
std::reverse(results.begin(), results.end());
return results;
}
public:
explicit T5UniGramTokenizer(bool is_umt5 = false) {
if (is_umt5) {
InitializePieces(ModelLoader::load_umt5_tokenizer_json());
} else {
InitializePieces(ModelLoader::load_t5_tokenizer_json());
}
        min_score_ = FLT_MAX;
        max_score_ = std::numeric_limits<float>::lowest();  // FLT_MIN is the smallest positive float, not the lowest value
        std::vector<std::pair<std::string, int>> pieces;
        for (size_t i = 0; i < piece_score_pairs.size(); i++) {
            const auto& sp = piece_score_pairs[i];
            min_score_     = std::min(min_score_, sp.second);
            max_score_     = std::max(max_score_, sp.second);
            pieces.emplace_back(sp.first, static_cast<int>(i));
        }
BuildTrie(&pieces);
}
    ~T5UniGramTokenizer() {}
std::string Normalize(const std::string& input) const {
std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " ");
return normalized;
}
std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const {
std::string normalized = Normalize(input);
normalized = pre_tokenizer.tokenize(normalized);
EncodeResult result = EncodeOptimized(normalized);
if (result.size() > 0 && append_eos_if_not_present) {
auto item = result[result.size() - 1];
if (item.first != eos_token_) {
result.emplace_back(eos_token_, eos_id_);
}
}
std::vector<int> tokens;
for (auto item : result) {
tokens.push_back(item.second);
}
return tokens;
}
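    // Splits the token sequence into chunks of max_length, terminating every
    // chunk with EOS, then pads the last chunk up to length with pad_id_. The
    // attention mask is additive (applied before softmax): 0.0 for real
    // tokens, -HUGE_VALF for padding.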
void pad_tokens(std::vector<int>& tokens,
std::vector<float>& weights,
std::vector<float>* attention_mask,
size_t max_length = 0,
bool padding = false) {
if (max_length > 0 && padding) {
size_t orig_token_num = tokens.size() - 1;
size_t n = std::ceil(orig_token_num * 1.0 / (max_length - 1));
if (n == 0) {
n = 1;
}
size_t length = max_length * n;
LOG_DEBUG("token length: %llu", length);
std::vector<int> new_tokens;
std::vector<float> new_weights;
std::vector<float> new_attention_mask;
int token_idx = 0;
            for (size_t i = 0; i < length; i++) {
if (token_idx >= orig_token_num) {
break;
}
if (attention_mask != nullptr) {
new_attention_mask.push_back(0.0);
}
if (i % max_length == max_length - 1) {
new_tokens.push_back(eos_id_);
new_weights.push_back(1.0);
} else {
new_tokens.push_back(tokens[token_idx]);
new_weights.push_back(weights[token_idx]);
token_idx++;
}
}
new_tokens.push_back(eos_id_);
new_weights.push_back(1.0);
if (attention_mask != nullptr) {
new_attention_mask.push_back(0.0);
}
tokens = new_tokens;
weights = new_weights;
if (attention_mask != nullptr) {
*attention_mask = new_attention_mask;
}
if (padding) {
int pad_token_id = pad_id_;
tokens.insert(tokens.end(), length - tokens.size(), pad_token_id);
weights.insert(weights.end(), length - weights.size(), 1.0);
if (attention_mask != nullptr) {
attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF);
}
}
}
}
float min_score() const { return min_score_; }
float max_score() const { return max_score_; }
Status status() const { return status_; }
};
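// T5's "layer norm" is RMSNorm: scale-only, with no mean subtraction and no
// bias, which maps directly onto ggml_rms_norm plus a multiply.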
class T5LayerNorm : public UnaryBlock {
protected:
int64_t hidden_size;
float eps;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type wtype = GGML_TYPE_F32;
params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
}
public:
T5LayerNorm(int64_t hidden_size,
float eps = 1e-06f)
: hidden_size(hidden_size),
eps(eps) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
struct ggml_tensor* w = params["weight"];
x = ggml_rms_norm(ctx, x, eps);
x = ggml_mul(ctx, x, w);
return x;
}
};
struct T5DenseActDense : public UnaryBlock {
public:
T5DenseActDense(int64_t model_dim, int64_t ff_dim) {
blocks["wi"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
x = wi->forward(ctx, x);
x = ggml_relu_inplace(ctx, x);
x = wo->forward(ctx, x);
return x;
}
};
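// Gated-GELU feed-forward used by T5 v1.1 and UMT5: wi_0 provides the GELU
// gate, wi_1 the linear branch, multiplied elementwise before wo.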
struct T5DenseGatedActDense : public UnaryBlock {
public:
T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
auto hidden_gelu = ggml_gelu_inplace(ctx, wi_0->forward(ctx, x));
auto hidden_linear = wi_1->forward(ctx, x);
x = ggml_mul_inplace(ctx, hidden_gelu, hidden_linear);
x = wo->forward(ctx, x);
return x;
}
};
struct T5LayerFF : public UnaryBlock {
public:
T5LayerFF(int64_t model_dim, int64_t ff_dim) {
blocks["DenseReluDense"] = std::shared_ptr<GGMLBlock>(new T5DenseGatedActDense(model_dim, ff_dim));
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
auto forwarded_states = layer_norm->forward(ctx, x);
forwarded_states = DenseReluDense->forward(ctx, forwarded_states);
x = ggml_add_inplace(ctx, forwarded_states, x);
return x;
}
};
class T5Attention : public GGMLBlock {
protected:
int64_t model_dim;
int64_t inner_dim;
int64_t num_heads;
bool using_relative_attention_bias;
int64_t relative_attention_num_buckets = 32;
int64_t relative_attention_max_distance = 128;
public:
T5Attention(int64_t model_dim,
int64_t inner_dim,
int64_t num_heads,
bool using_relative_attention_bias = false)
: model_dim(model_dim),
inner_dim(inner_dim),
num_heads(num_heads),
using_relative_attention_bias(using_relative_attention_bias) {
blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false));
if (using_relative_attention_bias) {
blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads));
}
}
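    // Looks up the learned relative-position bias for each (query, key)
    // bucket and permutes it so the per-head bias can be added onto the
    // attention scores.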
struct ggml_tensor* compute_bias(struct ggml_context* ctx,
struct ggml_tensor* relative_position_bucket) {
auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
        auto values = relative_attention_bias->forward(ctx, relative_position_bucket);
        values      = ggml_cont(ctx, ggml_permute(ctx, values, 2, 0, 1, 3));
        return values;
}
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q"]);
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k"]);
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v"]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]);
int64_t n_head = num_heads;
int64_t d_head = inner_dim / n_head;
auto q = q_proj->forward(ctx, x);
auto k = k_proj->forward(ctx, x);
auto v = v_proj->forward(ctx, x);
if (using_relative_attention_bias && relative_position_bucket != NULL) {
past_bias = compute_bias(ctx, relative_position_bucket);
}
if (past_bias != NULL) {
if (mask != NULL) {
mask = ggml_repeat(ctx, mask, past_bias);
mask = ggml_add(ctx, mask, past_bias);
} else {
mask = past_bias;
}
}
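        // T5 uses unscaled dot-product attention. Pre-multiplying k by
        // sqrt(d_head) is meant to cancel the 1/sqrt(d_head) factor that
        // ggml_nn_attention_ext applies internally.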
k = ggml_scale_inplace(ctx, k, sqrt(d_head));
x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, mask);
        x = out_proj->forward(ctx, x);
        return {x, past_bias};
}
};
struct T5LayerSelfAttention : public GGMLBlock {
public:
T5LayerSelfAttention(int64_t model_dim,
int64_t inner_dim,
int64_t ff_dim,
int64_t num_heads,
bool using_relative_attention_bias) {
blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias));
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
}
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
auto normed_hidden_state = layer_norm->forward(ctx, x);
auto ret = SelfAttention->forward(ctx, backend, normed_hidden_state, past_bias, mask, relative_position_bucket);
auto output = ret.first;
past_bias = ret.second;
x = ggml_add_inplace(ctx, output, x);
return {x, past_bias};
}
};
struct T5Block : public GGMLBlock {
public:
T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) {
blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias));
blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
}
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
auto ret = layer_0->forward(ctx, backend, x, past_bias, mask, relative_position_bucket);
x = ret.first;
past_bias = ret.second;
x = layer_1->forward(ctx, x);
return {x, past_bias};
}
};
struct T5Stack : public GGMLBlock {
int64_t num_layers;
public:
T5Stack(int64_t num_layers,
int64_t model_dim,
int64_t inner_dim,
int64_t ff_dim,
int64_t num_heads,
bool relative_attention = true)
: num_layers(num_layers) {
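        // With relative_attention (T5), only block 0 owns the relative
        // attention bias and forwards it to later blocks as past_bias;
        // without it (UMT5), every block computes its own per-layer bias.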
for (int i = 0; i < num_layers; i++) {
blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0)));
}
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* attention_mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
auto ret = block->forward(ctx, backend, x, past_bias, attention_mask, relative_position_bucket);
x = ret.first;
past_bias = ret.second;
}
auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
x = final_layer_norm->forward(ctx, x);
return x;
}
};
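// Defaults correspond to the T5-XXL (t5-v1_1-xxl) encoder; the T5 block below
// passes model_dim as inner_dim (num_heads * d_head == model_dim here).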
struct T5Params {
int64_t num_layers = 24;
int64_t model_dim = 4096;
int64_t ff_dim = 10240;
int64_t num_heads = 64;
int64_t vocab_size = 32128;
bool relative_attention = true;
};
struct T5 : public GGMLBlock {
T5Params params;
public:
T5() {}
T5(T5Params params)
: params(params) {
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(params.num_layers,
params.model_dim,
params.model_dim,
params.ff_dim,
params.num_heads,
params.relative_attention));
blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size,
params.model_dim));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* attention_mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
auto shared = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
auto x = shared->forward(ctx, input_ids);
x = encoder->forward(ctx, backend, x, past_bias, attention_mask, relative_position_bucket);
return x;
}
};
struct T5Runner : public GGMLRunner {
T5Params params;
T5 model;
std::vector<int> relative_position_bucket_vec;
T5Runner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types,
const std::string prefix,
bool is_umt5 = false)
: GGMLRunner(backend, offload_params_to_cpu) {
if (is_umt5) {
params.vocab_size = 256384;
params.relative_attention = false;
}
model = T5(params);
model.init(params_ctx, tensor_types, prefix);
}
std::string get_desc() {
return "t5";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids,
struct ggml_tensor* relative_position_bucket,
struct ggml_tensor* attention_mask = NULL) {
        auto hidden_states = model.forward(ctx, backend, input_ids, NULL, attention_mask, relative_position_bucket);
        return hidden_states;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
struct ggml_tensor* attention_mask = NULL) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
input_ids = to_backend(input_ids);
attention_mask = to_backend(attention_mask);
relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]);
auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx,
GGML_TYPE_I32,
input_ids->ne[0],
input_ids->ne[0]);
set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, relative_position_bucket, attention_mask);
ggml_build_forward_expand(gf, hidden_states);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* input_ids,
struct ggml_tensor* attention_mask,
ggml_tensor** output,
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids, attention_mask);
};
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
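    // Maps signed relative positions to bucket ids the same way as the
    // HuggingFace transformers T5 implementation: in the bidirectional case
    // half of the buckets encode direction, half of the remainder cover small
    // distances exactly, and the rest cover larger distances on a log scale
    // up to max_distance.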
static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
bool bidirectional = true,
int num_buckets = 32,
int max_distance = 128) {
std::vector<int> relative_buckets(relative_position.size(), 0);
std::vector<int> abs_relative_position = relative_position;
if (bidirectional) {
num_buckets = num_buckets / 2;
for (size_t i = 0; i < relative_position.size(); ++i) {
if (relative_position[i] > 0) {
relative_buckets[i] += num_buckets;
}
abs_relative_position[i] = std::abs(relative_position[i]);
}
} else {
for (size_t i = 0; i < relative_position.size(); ++i) {
abs_relative_position[i] = std::max(-relative_position[i], 0);
}
}
int max_exact = num_buckets / 2;
std::vector<int> relative_position_if_large(relative_position.size(), 0);
for (size_t i = 0; i < relative_position.size(); ++i) {
if (abs_relative_position[i] < max_exact) {
relative_buckets[i] += abs_relative_position[i];
} else {
float log_pos = std::log(static_cast<float>(abs_relative_position[i]) / max_exact);
float log_base = std::log(static_cast<float>(max_distance) / max_exact);
relative_position_if_large[i] = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact));
relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1);
relative_buckets[i] += relative_position_if_large[i];
}
}
return relative_buckets;
}
std::vector<int> compute_relative_position_bucket(int query_length,
int key_length) {
std::vector<int> context_position(query_length);
std::vector<int> memory_position(key_length);
for (int i = 0; i < query_length; ++i) {
context_position[i] = i;
}
for (int i = 0; i < key_length; ++i) {
memory_position[i] = i;
}
std::vector<std::vector<int>> relative_position(query_length, std::vector<int>(key_length, 0));
for (int i = 0; i < query_length; ++i) {
for (int j = 0; j < key_length; ++j) {
relative_position[i][j] = memory_position[j] - context_position[i];
}
}
std::vector<int> relative_position_bucket;
for (int i = 0; i < query_length; ++i) {
std::vector<int> result = _relative_position_bucket(relative_position[i], true);
relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end());
}
return relative_position_bucket;
}
};
struct T5Embedder {
T5UniGramTokenizer tokenizer;
T5Runner model;
T5Embedder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {},
const std::string prefix = "",
bool is_umt5 = false)
: model(backend, offload_params_to_cpu, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
void alloc_params_buffer() {
model.alloc_params_buffer();
}
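    // Tokenizes a prompt that may carry emphasis weights (e.g. "(text:1.3)",
    // handled by parse_prompt_attention), returning token ids, per-token
    // weights, and the additive attention mask from pad_tokens.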
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
std::vector<int> tokens;
std::vector<float> weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
int EOS_TOKEN_ID = 1;
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
std::vector<float> attention_mask;
tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding);
return {tokens, weights, attention_mask};
}
void test() {
struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);
        params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != NULL);
{
std::string text("a lovely cat");
auto tokens_and_weights = tokenize(text, 512, true);
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
std::vector<float>& weights = std::get<1>(tokens_and_weights);
std::vector<float>& masks = std::get<2>(tokens_and_weights);
for (auto token : tokens) {
printf("%d ", token);
}
printf("\n");
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
auto attention_mask = vector_to_ggml_tensor(work_ctx, masks);
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
model.compute(8, input_ids, attention_mask, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("t5 test done in %dms", t1 - t0);
}
}
static void load_from_file_and_test(const std::string& file_path) {
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}
auto tensor_types = model_loader.tensor_storages_types;
for (auto& item : tensor_types) {
if (ends_with(item.first, "weight")) {
item.second = model_data_type;
}
}
std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, false, tensor_types, "", true));
t5->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
t5->get_param_tensors(tensors, "");
bool success = model_loader.load_tensors(tensors);
if (!success) {
LOG_ERROR("load tensors from model loader failed");
return;
}
LOG_INFO("t5 model loaded");
t5->test();
}
};
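// Minimal usage sketch (assumes an initialized ggml backend; weights still
// have to be loaded as in load_from_file_and_test() before compute() works):
//
//   ggml_backend_t backend = ggml_backend_cpu_init();
//   T5Embedder t5(backend, /*offload_params_to_cpu=*/false);
//   auto [tokens, weights, mask] = t5.tokenize("a lovely cat", 512, true);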
#endif