// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Include guard (still compiled once per target)
#if defined(HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_
#undef HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_
#endif

#include <stddef.h>
#include <stdint.h>

#include "hwy/cache_control.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

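// Adds two scalars via f32 arithmetic so that it also works for element types
// such as bf16/f16, which may not provide a native operator+.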
template <typename TA, typename TB>
TA AddScalar(TA a, TB b) {
  return ConvertScalarTo<TA>(ConvertScalarTo<float>(a) +
                             ConvertScalarTo<float>(b));
}

template <size_t kOuter, size_t kInner, typename T, bool kAdd>
HWY_NOINLINE void MatVecAddImpl(const T* HWY_RESTRICT mat,
                                const T* HWY_RESTRICT vec,
                                const T* HWY_RESTRICT add, T* HWY_RESTRICT out,
                                hwy::ThreadPool& pool) {
  (void)add;

  // Process multiple rows at a time so that we write multiples of a cache line
  // to avoid false sharing (>= 64). 128 is better than 256. 512 has too little
  // parallelization potential.
  constexpr size_t kChunkSize2 = 64 / sizeof(T);
  const uint64_t num_chunks = static_cast<uint64_t>(kOuter / kChunkSize2);

  const ScalableTag<T> d;
  const size_t N = Lanes(d);
  // Required for Stream loop, otherwise we might have partial vectors.
  HWY_DASSERT(kChunkSize2 >= N);
  pool.Run(0, num_chunks,
           [&](const uint64_t chunk, size_t /*thread*/) HWY_ATTR {
             // MSVC workaround: duplicate to ensure constexpr.
             constexpr size_t kChunkSize = 64 / sizeof(T);
             // Software write-combining to avoid cache pollution from out.
             // Although `out` may be used later, keeping it out of the cache
             // now and avoiding RFOs is a consistent 5% overall win.
             HWY_ALIGN T buf[kChunkSize];

             // Only handle entire chunks here because the Stream is not masked.
             // Remaining rows are handled after the pool.Run.
             const size_t begin = static_cast<size_t>(chunk * kChunkSize);
             for (size_t idx_row = 0; idx_row < kChunkSize; ++idx_row) {
               auto sum0 = Zero(d);
               auto sum1 = Zero(d);
               // 4x unrolling barely helps SKX but likely helps Arm V2.
               auto sum2 = Zero(d);
               auto sum3 = Zero(d);

               const T* HWY_RESTRICT row = &mat[(begin + idx_row) * kInner];
               size_t i = 0;
               // No clear win from prefetching from the next 1..3 rows.
               // clflush &row[i] is slow, clflushopt less so but not helping.
               HWY_UNROLL(1)
               for (; i + 4 * N <= kInner; i += 4 * N) {
                 const auto a0 = LoadU(d, row + i + 0 * N);
                 const auto v0 = LoadU(d, vec + i + 0 * N);
                 sum0 = MulAdd(a0, v0, sum0);

                 const auto a1 = LoadU(d, row + i + 1 * N);
                 const auto v1 = LoadU(d, vec + i + 1 * N);
                 sum1 = MulAdd(a1, v1, sum1);

                 const auto a2 = LoadU(d, row + i + 2 * N);
                 const auto v2 = LoadU(d, vec + i + 2 * N);
                 sum2 = MulAdd(a2, v2, sum2);

                 const auto a3 = LoadU(d, row + i + 3 * N);
                 const auto v3 = LoadU(d, vec + i + 3 * N);
                 sum3 = MulAdd(a3, v3, sum3);
               }
               // Last entire vectors
               for (; i + N <= kInner; i += N) {
                 const auto a0 = LoadU(d, row + i);
                 const auto v0 = LoadU(d, vec + i);
                 sum0 = MulAdd(a0, v0, sum0);
               }
               const size_t remainder = kInner - i;
               if (remainder != 0) {
                 const auto a0 = LoadN(d, row + i, remainder);
                 const auto v0 = LoadN(d, vec + i, remainder);
                 sum1 = MulAdd(a0, v0, sum1);
               }
               // Reduction tree: sum of all accumulators, then their lanes
               sum2 = Add(sum2, sum3);
               sum0 = Add(sum0, sum1);
               sum0 = Add(sum0, sum2);
               buf[idx_row] = ReduceSum(d, sum0);
               HWY_IF_CONSTEXPR(kAdd) {
                 buf[idx_row] = AddScalar(buf[idx_row], add[begin + idx_row]);
               }
             }  // idx_row
             HWY_UNROLL(4)  // 1..4 iterations
             for (size_t i = 0; i != kChunkSize; i += N) {
               Stream(Load(d, buf + i), d, out + begin + i);
             }
           });
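  // Make the non-temporal (Stream) stores above globally visible before the
  // caller reads `out`.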
  hwy::FlushStream();

  // Handle remainder rows which are not a multiple of the chunk size.
  for (size_t r = num_chunks * kChunkSize2; r < kOuter; ++r) {
    auto sum0 = Zero(d);

    const T* HWY_RESTRICT row = &mat[r * kInner];
    size_t i = 0;
    HWY_UNROLL(1)
    for (; i + N <= kInner; i += N) {
      const auto a0 = LoadU(d, row + i);
      const auto v0 = LoadU(d, vec + i);
      sum0 = MulAdd(a0, v0, sum0);
    }
    const size_t remainder = kInner - i;
    if (remainder != 0) {
      const auto a0 = LoadN(d, row + i, remainder);
      const auto v0 = LoadN(d, vec + i, remainder);
      sum0 = MulAdd(a0, v0, sum0);
    }
    out[r] = ReduceSum(d, sum0);
    HWY_IF_CONSTEXPR(kAdd) { out[r] = AddScalar(out[r], add[r]); }
  }  // r
}

// Multiplies mat with vec, adds add and puts the result in out.
//
// mat is a (kOuter, kInner)-shaped array, where element [i,j] is located at
// index i * kInner + j.
//
// vec is a (kInner,)-shaped array.
//
// add is a (kOuter,)-shaped array.
//
// out is a (kOuter,)-shaped array that will be set to mat @ vec + add.
template <size_t kOuter, size_t kInner, typename T>
HWY_NOINLINE void MatVecAdd(const T* HWY_RESTRICT mat,
                            const T* HWY_RESTRICT vec,
                            const T* HWY_RESTRICT add, T* HWY_RESTRICT out,
                            hwy::ThreadPool& pool) {
  MatVecAddImpl<kOuter, kInner, T, true>(mat, vec, add, out, pool);
}
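
// Example usage of the f32 path (an illustrative sketch only; the dimensions,
// pool size and fill values are arbitrary, and <vector> is assumed to be
// included by the caller):
//
//   constexpr size_t kOuter = 64, kInner = 256;
//   hwy::ThreadPool pool(4);
//   std::vector<float> mat(kOuter * kInner, 1.0f);  // row-major (kOuter, kInner)
//   std::vector<float> vec(kInner, 2.0f);
//   std::vector<float> add(kOuter, 0.5f);
//   std::vector<float> out(kOuter);
//   MatVecAdd<kOuter, kInner>(mat.data(), vec.data(), add.data(), out.data(),
//                             pool);
//   // Each out[i] is now 256 * 1.0f * 2.0f + 0.5f = 512.5f.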

// Multiplies mat with vec and puts the result in out.
//
// mat is a (kOuter, kInner)-shaped array, where element [i,j] is located at
// index i * kInner + j.
//
// vec is a (kInner,)-shaped array.
//
// out is a (kOuter,)-shaped array that will be set to mat @ vec.
template <size_t kOuter, size_t kInner, typename T>
HWY_NOINLINE void MatVec(const T* HWY_RESTRICT mat, const T* HWY_RESTRICT vec,
                         T* HWY_RESTRICT out, hwy::ThreadPool& pool) {
  MatVecAddImpl<kOuter, kInner, T, false>(mat, vec, /*add=*/nullptr, out, pool);
}

// HWY_SCALAR lacks too many of the ops required by this implementation; use
// HWY_EMU128 instead.
#if HWY_TARGET != HWY_SCALAR

// Specialization for bf16 matrix, which halves memory bandwidth requirements.
template <size_t kOuter, size_t kInner, bool kAdd>
HWY_NOINLINE void MatVecAddImpl(const hwy::bfloat16_t* HWY_RESTRICT mat,
                                const float* HWY_RESTRICT vec,
                                const float* HWY_RESTRICT add,
                                float* HWY_RESTRICT out,
                                hwy::ThreadPool& pool) {
  // Process multiple rows at a time so that we write multiples of a cache line
  // to avoid false sharing (>= 64). 128 is better than 256. 512 has too little
  // parallelization potential.
  constexpr size_t kChunkSize2 = 64 / sizeof(float);
  const uint64_t num_chunks = static_cast<uint64_t>(kOuter / kChunkSize2);

  const ScalableTag<float> d;
  const Repartition<hwy::bfloat16_t, decltype(d)> d16;
  // In the remainder loop, we only process a single f32 vector, so load half
  // vectors of bf16 to avoid overrun.
  const Half<decltype(d16)> d16h;
  using V = Vec<decltype(d)>;
  using V16 = Vec<decltype(d16)>;
  using V16H = Vec<decltype(d16h)>;
  const size_t N = Lanes(d);
  // Required for Stream loop, otherwise we might have partial vectors.
  HWY_DASSERT(kChunkSize2 >= N);
  pool.Run(0, num_chunks,
           [&](const uint64_t chunk, size_t /*thread*/) HWY_ATTR {
             // MSVC workaround: duplicate to ensure constexpr.
             constexpr size_t kChunkSize = 64 / sizeof(float);
             // Software write-combining to avoid cache pollution from out.
             // Although `out` may be used later, keeping it out of the cache
             // now and avoiding RFOs is a consistent 5% overall win.
             HWY_ALIGN float buf[kChunkSize];

             // Only handle entire chunks here because the Stream is not masked.
             // Remaining rows are handled after the pool.Run.
             const size_t begin = static_cast<size_t>(chunk * kChunkSize);
             for (size_t idx_row = 0; idx_row < kChunkSize; ++idx_row) {
               auto sum0 = Zero(d);
               auto sum1 = Zero(d);
               // 4x unrolling barely helps SKX but likely helps Arm V2.
               auto sum2 = Zero(d);
               auto sum3 = Zero(d);

               const hwy::bfloat16_t* HWY_RESTRICT row =
                   &mat[(begin + idx_row) * kInner];
               size_t i = 0;
               // No clear win from prefetching from the next 1..3 rows.
               // clflush &row[i] is slow, clflushopt less so but not helping.
               HWY_UNROLL(1)
               for (; i + 4 * N <= kInner; i += 4 * N) {
                 const V16 b0 = LoadU(d16, row + i + 0 * N);
                 const V a0 = PromoteLowerTo(d, b0);
                 const V a1 = PromoteUpperTo(d, b0);

                 const V16 b1 = LoadU(d16, row + i + 2 * N);
                 const V a2 = PromoteLowerTo(d, b1);
                 const V a3 = PromoteUpperTo(d, b1);

                 const V v0 = LoadU(d, vec + i + 0 * N);
                 sum0 = MulAdd(a0, v0, sum0);

                 const V v1 = LoadU(d, vec + i + 1 * N);
                 sum1 = MulAdd(a1, v1, sum1);

                 const V v2 = LoadU(d, vec + i + 2 * N);
                 sum2 = MulAdd(a2, v2, sum2);

                 const V v3 = LoadU(d, vec + i + 3 * N);
                 sum3 = MulAdd(a3, v3, sum3);
               }
               // Last entire vectors
               for (; i + N <= kInner; i += N) {
                 const V16H b0 = LoadU(d16h, row + i);
                 const V a0 = PromoteTo(d, b0);
                 const V v0 = LoadU(d, vec + i);
                 sum0 = MulAdd(a0, v0, sum0);
               }
               const size_t remainder = kInner - i;
               if (remainder != 0) {
                 const V16H b0 = LoadN(d16h, row + i, remainder);
                 const V a0 = PromoteTo(d, b0);
                 const V v0 = LoadN(d, vec + i, remainder);
                 sum1 = MulAdd(a0, v0, sum1);
               }
               // Reduction tree: sum of all accumulators, then their lanes
               sum2 = Add(sum2, sum3);
               sum0 = Add(sum0, sum1);
               sum0 = Add(sum0, sum2);
               buf[idx_row] = ReduceSum(d, sum0);
               HWY_IF_CONSTEXPR(kAdd) {
                 buf[idx_row] = AddScalar(buf[idx_row], add[begin + idx_row]);
               }
             }  // idx_row
             HWY_UNROLL(4)  // 1..4 iterations
             for (size_t i = 0; i != kChunkSize; i += N) {
               Stream(Load(d, buf + i), d, out + begin + i);
             }
           });
  hwy::FlushStream();

  // Handle remainder rows which are not a multiple of the chunk size.
  for (size_t r = num_chunks * kChunkSize2; r < kOuter; ++r) {
    auto sum0 = Zero(d);

    const hwy::bfloat16_t* HWY_RESTRICT row = &mat[r * kInner];
    size_t i = 0;
    HWY_UNROLL(1)
    for (; i + N <= kInner; i += N) {
      const V16H b0 = LoadU(d16h, row + i);
      const V a0 = PromoteTo(d, b0);
      const V v0 = LoadU(d, vec + i);
      sum0 = MulAdd(a0, v0, sum0);
    }
    const size_t remainder = kInner - i;
    if (remainder != 0) {
      const V16H b0 = LoadN(d16h, row + i, remainder);
      const V a0 = PromoteTo(d, b0);
      const V v0 = LoadN(d, vec + i, remainder);
      sum0 = MulAdd(a0, v0, sum0);
    }
    out[r] = ReduceSum(d, sum0);
    HWY_IF_CONSTEXPR(kAdd) { out[r] = AddScalar(out[r], add[r]); }
  }  // r
}

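// As MatVecAdd above, but with a bf16 matrix; vec, add and out remain f32.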
template <size_t kOuter, size_t kInner>
HWY_NOINLINE void MatVecAdd(const hwy::bfloat16_t* HWY_RESTRICT mat,
                            const float* HWY_RESTRICT vec,
                            const float* HWY_RESTRICT add,
                            float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
  MatVecAddImpl<kOuter, kInner, true>(mat, vec, add, out, pool);
}

template <size_t kOuter, size_t kInner>
HWY_NOINLINE void MatVec(const hwy::bfloat16_t* HWY_RESTRICT mat,
                         const float* HWY_RESTRICT vec, float* HWY_RESTRICT out,
                         hwy::ThreadPool& pool) {
  MatVecAddImpl<kOuter, kInner, false>(mat, vec, /*add=*/nullptr, out, pool);
}

// Both mat and vec are bf16.
template <size_t kOuter, size_t kInner, bool kAdd>
HWY_NOINLINE void MatVecAddImpl(const hwy::bfloat16_t* HWY_RESTRICT mat,
                                const hwy::bfloat16_t* HWY_RESTRICT vec,
                                const hwy::bfloat16_t* HWY_RESTRICT add,
                                float* HWY_RESTRICT out,
                                hwy::ThreadPool& pool) {
  // Process multiple rows at a time so that we write multiples of a cache line
  // to avoid false sharing (>= 64). 128 is better than 256. 512 has too little
  // parallelization potential.
  constexpr size_t kChunkSize2 = 64 / sizeof(bfloat16_t);
  const uint64_t num_chunks = static_cast<uint64_t>(kOuter / kChunkSize2);

  const ScalableTag<float> df;
  const Repartition<hwy::bfloat16_t, decltype(df)> d16;
  using V16 = Vec<decltype(d16)>;
  const size_t N = Lanes(d16);
  // Required for Stream loop, otherwise we might have partial vectors.
  HWY_DASSERT(kChunkSize2 >= N);
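  // Note: ReorderWidenMulAccumulate widens bf16 lanes to f32 and accumulates
  // the products into sum0/sum1 in an unspecified lane order; that is fine
  // here because only their total (via ReduceSum) is needed.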
  pool.Run(0, num_chunks,
           [&](const uint64_t chunk, size_t /*thread*/) HWY_ATTR {
             // MSVC workaround: duplicate to ensure constexpr.
             constexpr size_t kChunkSize = 64 / sizeof(bfloat16_t);
             // Software write-combining to avoid cache pollution from out.
             // Although `out` may be used later, keeping it out of the cache
             // now and avoiding RFOs is a consistent 5% overall win.
             HWY_ALIGN float buf[kChunkSize];

             // Only handle entire chunks here because the Stream is not masked.
             // Remaining rows are handled after the pool.Run.
             const size_t begin = static_cast<size_t>(chunk * kChunkSize);
             for (size_t idx_row = 0; idx_row < kChunkSize; ++idx_row) {
               auto sum0 = Zero(df);
               auto sum1 = Zero(df);
               auto sum2 = Zero(df);
               auto sum3 = Zero(df);

               const hwy::bfloat16_t* HWY_RESTRICT row =
                   &mat[(begin + idx_row) * kInner];
               size_t i = 0;
               // No clear win from prefetching from the next 1..3 rows.
               // clflush &row[i] is slow, clflushopt less so but not helping.
               HWY_UNROLL(1)
               for (; i + 2 * N <= kInner; i += 2 * N) {
                 const V16 b0 = LoadU(d16, row + i + 0 * N);
                 const V16 b1 = LoadU(d16, row + i + 1 * N);
                 const V16 v0 = LoadU(d16, vec + i + 0 * N);
                 const V16 v1 = LoadU(d16, vec + i + 1 * N);
                 sum0 = ReorderWidenMulAccumulate(df, b0, v0, sum0, sum1);
                 sum2 = ReorderWidenMulAccumulate(df, b1, v1, sum2, sum3);
               }
               // Last entire vector
               for (; i + N <= kInner; i += N) {
                 const V16 b0 = LoadU(d16, row + i);
                 const V16 v0 = LoadU(d16, vec + i);
                 sum0 = ReorderWidenMulAccumulate(df, b0, v0, sum0, sum1);
               }
               const size_t remainder = kInner - i;
               if (remainder != 0) {
                 const V16 b0 = LoadN(d16, row + i, remainder);
                 const V16 v0 = LoadN(d16, vec + i, remainder);
                 sum2 = ReorderWidenMulAccumulate(df, b0, v0, sum2, sum3);
               }
               // Reduction tree: sum of all accumulators, then their lanes
               sum0 = Add(sum0, sum1);
               sum2 = Add(sum2, sum3);
               sum0 = Add(sum0, sum2);
               buf[idx_row] = ReduceSum(df, sum0);
               HWY_IF_CONSTEXPR(kAdd) {
                 buf[idx_row] = AddScalar(buf[idx_row], add[begin + idx_row]);
               }
             }  // idx_row
             HWY_UNROLL(4)  // 1..4 iterations
             for (size_t i = 0; i != kChunkSize; i += N / 2) {
               Stream(Load(df, buf + i), df, out + begin + i);
             }
           });
  hwy::FlushStream();

  // Handle remainder rows which are not a multiple of the chunk size.
  for (size_t r = num_chunks * kChunkSize2; r < kOuter; ++r) {
    auto sum0 = Zero(df);
    auto sum1 = Zero(df);

    const hwy::bfloat16_t* HWY_RESTRICT row = &mat[r * kInner];
    size_t i = 0;
    HWY_UNROLL(1)
    for (; i + N <= kInner; i += N) {
      const V16 b0 = LoadU(d16, row + i);
      const V16 v0 = LoadU(d16, vec + i);
      sum0 = ReorderWidenMulAccumulate(df, b0, v0, sum0, sum1);
    }
    const size_t remainder = kInner - i;
    if (remainder != 0) {
      const V16 b0 = LoadN(d16, row + i, remainder);
      const V16 v0 = LoadN(d16, vec + i, remainder);
      sum0 = ReorderWidenMulAccumulate(df, b0, v0, sum0, sum1);
    }
    out[r] = ReduceSum(df, Add(sum0, sum1));
    HWY_IF_CONSTEXPR(kAdd) { out[r] = AddScalar(out[r], add[r]); }
  }  // r
}

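// As MatVecAdd above, but mat, vec and add are all bf16; out remains f32.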
template <size_t kOuter, size_t kInner>
HWY_NOINLINE void MatVecAdd(const hwy::bfloat16_t* HWY_RESTRICT mat,
                            const hwy::bfloat16_t* HWY_RESTRICT vec,
                            const hwy::bfloat16_t* HWY_RESTRICT add,
                            float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
  MatVecAddImpl<kOuter, kInner, true>(mat, vec, add, out, pool);
}

template <size_t kOuter, size_t kInner>
HWY_NOINLINE void MatVec(const hwy::bfloat16_t* HWY_RESTRICT mat,
                         const hwy::bfloat16_t* HWY_RESTRICT vec,
                         float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
  MatVecAddImpl<kOuter, kInner, false>(mat, vec, /*add=*/nullptr, out, pool);
}
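
// Example usage of the all-bf16 overload (an illustrative sketch only;
// dimensions and values are arbitrary, and <vector> is assumed):
//
//   constexpr size_t kOuter = 64, kInner = 256;
//   hwy::ThreadPool pool(4);
//   const hwy::bfloat16_t one = hwy::ConvertScalarTo<hwy::bfloat16_t>(1.0f);
//   std::vector<hwy::bfloat16_t> mat(kOuter * kInner, one);
//   std::vector<hwy::bfloat16_t> vec(kInner, one);
//   std::vector<float> out(kOuter);  // the output is always f32
//   MatVec<kOuter, kInner>(mat.data(), vec.data(), out.data(), pool);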

#endif  // HWY_TARGET != HWY_SCALAR

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_