#pragma once
#include "ggml-openvino-extra.h"
#include "ggml.h"
#include <cstdint>
#include <openvino/op/constant.hpp>
#include <openvino/runtime/tensor.hpp>
// Declaration only; the definition is not in this header.
// Reads from `data` (const) and writes to `dst`.
// NOTE(review): the name suggests it expands 32 packed 4-bit values into
// one byte each — confirm element count and nibble order against the definition.
void unpack_32_4(const uint8_t* data, uint8_t* dst);
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor and three ov::Tensor arguments by non-const
// reference (weights, scales, zp), presumably filled as outputs.
// NOTE(review): presumably `tensor` must hold GGML Q4_0 data and the three
// tensors must be pre-allocated by the caller — confirm against the definition.
void extract_q4_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor and three ov::Tensor arguments by non-const
// reference (weights, scales, zp), presumably filled as outputs.
// `use_bias` defaults to false.
// NOTE(review): presumably `tensor` must hold GGML Q4_1 data; the exact effect
// of `use_bias` on what is written to `zp_arr` is not visible here — confirm.
void extract_q4_1_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
bool use_bias = false);
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor and three ov::Tensor arguments by non-const
// reference (weights, scales, zp), presumably filled as outputs.
// NOTE(review): presumably `tensor` must hold GGML Q8_0 data and the three
// tensors must be pre-allocated by the caller — confirm against the definition.
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);
// Declaration only; the definition is not in this header.
// Reads from `data` (const) and writes to `dst`.
// NOTE(review): the name suggests it expands 256 packed 4-bit values into
// one byte each — confirm element count and nibble order against the definition.
void unpack_256_4(const uint8_t* data, uint8_t* dst);
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor and three ov::Tensor arguments by non-const
// reference (weights, scales, zp), presumably filled as outputs.
// `use_bias` defaults to false.
// NOTE(review): presumably `tensor` must hold GGML Q4_K data; the exact effect
// of `use_bias` on what is written to `zp_arr` is not visible here — confirm.
void extract_q4_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
bool use_bias = false);
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor and three ov::Tensor arguments by non-const
// reference (weights, scales, zp), presumably filled as outputs.
// `use_bias` defaults to false.
// NOTE(review): presumably `tensor` must hold GGML Q5_K data; the exact effect
// of `use_bias` on what is written to `zp_arr` is not visible here — confirm.
void extract_q5_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
bool use_bias = false);
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor and three ov::Tensor arguments by non-const
// reference (weights, scales, zp), presumably filled as outputs.
// NOTE(review): presumably `tensor` must hold GGML Q6_K data and the three
// tensors must be pre-allocated by the caller — confirm against the definition.
void extract_q6_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);
// Default value (32) of the `group_size` parameter of make_int8_weights and
// make_int4_weights declared in this header.
// `static` gives the constant internal linkage, so each translation unit that
// includes this header gets its own copy.
static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;
// Declaration only; the definition is not in this header.
// Takes weight, scales and zp tensors by non-const reference and returns an
// ov::Output<ov::Node>.
// `group_size` defaults to GGML_QUANTIZATION_GROUP_SIZE; `use_bias` defaults to false.
// NOTE(review): presumably builds a graph sub-expression for 8-bit weights
// from the three tensors; whether the tensors are modified or must outlive the
// returned node is not visible here — confirm against the definition.
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & zp,
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
bool use_bias = false);
// Declaration only; the definition is not in this header.
// Takes weight, scales and zp tensors by non-const reference and returns an
// ov::Output<ov::Node>.
// `group_size` defaults to GGML_QUANTIZATION_GROUP_SIZE; `use_bias` defaults to false.
// NOTE(review): presumably builds a graph sub-expression for 4-bit weights
// from the three tensors; whether the tensors are modified or must outlive the
// returned node is not visible here — confirm against the definition.
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & zp,
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
bool use_bias = false);
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor, a separate read-only `data` pointer, and
// three ov::Tensor arguments by non-const reference; returns a shared_ptr to
// an ov::Node. `use_bias` defaults to false.
// NOTE(review): presumably `data` is the raw quantized payload to read instead
// of the tensor's own data pointer, and the three tensors are outputs — confirm
// against the definition, including whether the result can be null.
std::shared_ptr<ov::Node> extract_quantized_weights(
const ggml_tensor * tensor,
const void * data, ov::Tensor & weights,
ov::Tensor & scales,
ov::Tensor & zp,
bool use_bias = false);
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor, a read-only `data` pointer, a target
// ExtraQuantType, a block size, and three ov::Tensor arguments by non-const
// reference; returns a shared_ptr to an ov::Node.
// NOTE(review): presumably converts the source data to `requant_type` with
// groups of `block_size` elements and writes into the three tensors — confirm
// units of `block_size` and null-return behavior against the definition.
std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
const void * data, ExtraQuantType requant_type,
int64_t block_size,
ov::Tensor & weights,
ov::Tensor & scales,
ov::Tensor & zp);
// Returns a printable name for an ExtraQuantType value.
// Values without an explicit case below yield "unknown".
// The result always points at a string literal, so it never needs freeing.
inline const char * extra_quant_type_name(ExtraQuantType t) {
    const char * label = "unknown";
    switch (t) {
        case ExtraQuantType::F16:      label = "F16";      break;
        case ExtraQuantType::Q4_0_C:   label = "Q4_0_C";   break;
        case ExtraQuantType::Q4_0_128: label = "Q4_0_128"; break;
        case ExtraQuantType::Q8_0_C:   label = "Q8_0_C";   break;
        case ExtraQuantType::Q8_0_32:  label = "Q8_0_32";  break;
        case ExtraQuantType::Q8_1_C:   label = "Q8_1_C";   break;
        default:                                           break;
    }
    return label;
}
// Aggregate returned by process_weight_tensor: a graph node plus the layout
// descriptor and the three tensors associated with one weight.
// NOTE(review): ownership/lifetime relationship between `weight_node` and the
// tensors is not visible in this header — confirm before relying on it.
struct OvWeight {
    std::shared_ptr<ov::Node>      weight_node;
    ggml_openvino_extracted_layout layout;
    ov::Tensor                     weights;
    ov::Tensor                     scales;
    ov::Tensor                     zp;

    // True when the layout reports a scales size greater than zero.
    bool is_quantized() const {
        return layout.scales_size > 0;
    }
};
// Declaration only; the definition is not in this header.
// Takes a read-only ggml tensor and a read-only `data` pointer and returns an
// OvWeight by value. `output_base_ptr` defaults to nullptr and `use_bias`
// defaults to false.
// NOTE(review): presumably a non-null `output_base_ptr` makes the result
// tensors use caller-provided memory instead of allocating — confirm the
// required size/alignment of that buffer against the definition.
OvWeight process_weight_tensor(
const ggml_tensor * tensor,
const void * data, void * output_base_ptr = nullptr, bool use_bias = false);
// Declaration only; the definition is not in this header.
// Takes read-only float input `x`, three ov::Tensor arguments by non-const
// reference (weights, scales, zp), and two int64_t sizes `k` and `qk`.
// NOTE(review): presumably `k` is the number of floats in `x` and `qk` the
// number of elements per quantization group, with Q4_0-style output — confirm
// against the definition, including whether `k` must be a multiple of `qk`.
void quantize_q4_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk);
// Declaration only; the definition is not in this header.
// Takes read-only float input `x`, three ov::Tensor arguments by non-const
// reference (weights, scales, zp), and two int64_t sizes `k` and `qk`.
// NOTE(review): presumably `k` is the number of floats in `x` and `qk` the
// number of elements per quantization group, with Q8_1-style output — confirm
// against the definition, including whether `k` must be a multiple of `qk`.
void quantize_q8_1(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk);
// Declaration only; the definition is not in this header.
// Takes read-only float input `x`, three ov::Tensor arguments by non-const
// reference (weights, scales, zp), and two int64_t sizes `k` and `qk`.
// NOTE(review): presumably `k` is the number of floats in `x` and `qk` the
// number of elements per quantization group, with Q8_0-style output — confirm
// against the definition, including whether `k` must be a multiple of `qk`.
void quantize_q8_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk);
// Re-opens the ov::op::util namespace to declare get_single_value.
// NOTE(review): this header declares a function inside OpenVINO's own
// namespace; presumably the definition is supplied by the OpenVINO library
// and this is a forward declaration for a symbol not exposed by the included
// public headers — confirm the signature matches the library's.
namespace ov {
namespace op {
namespace util {
// Takes a Constant node and writes into `value` (non-const reference);
// returns bool. `check_value_range` defaults to true.
// NOTE(review): presumably returns false when the constant does not hold a
// single (or uniform) value convertible to float — confirm.
bool get_single_value(const std::shared_ptr<ov::op::v0::Constant>& const_node,
float& value,
bool check_value_range = true);
} } }  // namespace util, namespace op, namespace ov