1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#ifndef GGML_SYCL_MMVQ_HPP
#define GGML_SYCL_MMVQ_HPP
#include "common.hpp"
void ggml_sycl_op_mul_mat_vec_q(
ggml_backend_sycl_context & ctx,
const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
float *dst_dd_i, const int64_t row_low, const int64_t row_high,
const int64_t src1_ncols, const int64_t src1_padded_row_size,
const dpct::queue_ptr &stream);
// Requires standard (non-reorder) block layout for src0.
// Returns false if src0_type isn't handled; caller should fall back.
bool ggml_sycl_mul_mat_vec_q_id(
enum ggml_type src0_type,
const void * vx_base, // start of stacked expert weights
const void * vy, // pre-quantized src1 (Q8_1)
const int32_t * ids_dev, // device-side int32, length n_experts_used
float * dst_base,
int ncols,
int nrows,
int n_experts_used,
size_t expert_weight_stride, // bytes between experts in vx_base
size_t dst_row_stride, // bytes between dst rows
size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes
dpct::queue_ptr stream);
#endif // GGML_SYCL_MMVQ_HPP