1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
#define HIGHWAY_HWY_CACHE_CONTROL_H_
#include "hwy/aligned_allocator.h" // HWY_ALIGNMENT
#include "hwy/base.h"
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
// https://github.com/gperftools/gperftools/issues/946).
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
#undef HWY_DISABLE_CACHE_CONTROL
#define HWY_DISABLE_CACHE_CONTROL
#endif
#ifndef HWY_DISABLE_CACHE_CONTROL
// intrin.h is sufficient on MSVC and already included by base.h.
#if HWY_ARCH_X86 && !HWY_COMPILER_MSVC
#include <emmintrin.h> // SSE2
#include <xmmintrin.h> // _mm_prefetch
#elif HWY_ARCH_ARM_A64
#include <arm_acle.h>
#endif
#endif // HWY_DISABLE_CACHE_CONTROL
namespace hwy {
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
#define HWY_STREAM_MULTIPLE 16
// The following functions may also require an attribute.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
// NOTE(review): target("sse2") presumably permits the SSE2 intrinsics below
// even in TUs not otherwise compiled for SSE2 — confirm against toolchains.
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
#else
// MSVC, non-x86, or cache control disabled: no attribute needed.
#define HWY_ATTR_CACHE
#endif
// Windows.h #defines this, which causes infinite recursion. Temporarily
// undefine to avoid conflict with our function.
// TODO(janwas): remove when this function is removed.
#pragma push_macro("LoadFence")
#undef LoadFence
// Makes subsequent loads wait until all prior loads are visible. Note
// vendor-specific behavior: on Intel CPUs (but not AMD), this also acts as a
// full fence, i.e. it waits for all previously issued instructions to
// complete.
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if defined(HWY_DISABLE_CACHE_CONTROL) || !HWY_ARCH_X86
  // No-op: only implemented for x86 with cache control enabled.
#else
  _mm_lfence();
#endif
}
// TODO(janwas): remove when this function is removed. (See above.)
#pragma pop_macro("LoadFence")
// Overwrites the HWY_ALIGNMENT bytes at "to" while attempting to bypass the
// cache (avoiding a read-for-ownership of the destination line). Both
// pointers must be HWY_ALIGNMENT-aligned. As with `Stream`, call
// `FlushStream` before publishing the data to other cores.
static HWY_INLINE HWY_ATTR_CACHE void StreamCacheLine(
    const uint64_t* HWY_RESTRICT from, uint64_t* HWY_RESTRICT to) {
  HWY_DASSERT(IsAligned(from));
  HWY_DASSERT(IsAligned(to));
#if HWY_COMPILER_CLANG && !defined(HWY_DISABLE_CACHE_CONTROL)
  // Clang-only builtin that emits non-temporal (streaming) stores.
  for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); ++i) {
    __builtin_nontemporal_store(from[i], to + i);
  }
#else
  // Fallback: an ordinary copy. Correct, but does not bypass the cache.
  hwy::CopyBytes(from, to, HWY_ALIGNMENT);
#endif
}
// Guarantees that the current core observes all values written by earlier
// `Stream` calls. This alone does NOT synchronize across cores: a producer
// whose `Stream` output is consumed elsewhere must still publish availability
// (e.g. mutex or atomic_flag) after calling `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#if defined(HWY_DISABLE_CACHE_CONTROL) || !HWY_ARCH_X86
  // No-op: only implemented for x86 with cache control enabled.
#else
  _mm_sfence();
#endif
}
// May begin fetching the cache line containing "p" so that subsequent actual
// loads complete with lower latency. Purely a hint; has no effect on
// correctness.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
  (void)p;  // unused when no prefetch intrinsic is available below
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
#elif HWY_COMPILER_GCC && !defined(HWY_DISABLE_CACHE_CONTROL)  // incl. clang
  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
  // desirable, so use the default 3 (keep in caches).
  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
#endif
}
// If supported on this platform, writes back and invalidates the cache line
// containing "p"; otherwise does nothing.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if defined(HWY_DISABLE_CACHE_CONTROL) || !HWY_ARCH_X86
  (void)p;  // no flush instruction available
#else
  _mm_clflush(p);
#endif
}
// Signals that the caller is inside a spin loop, which can lower power usage
// and coherency traffic. On x86, for example, this limits outstanding load
// requests and thus reduces the memory order violation penalty on loop exit.
HWY_INLINE HWY_ATTR_CACHE void Pause() {
#if defined(HWY_DISABLE_CACHE_CONTROL)
  // No hint emitted.
#elif HWY_ARCH_X86
  _mm_pause();
#elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG
  // ACLE intrinsic. YIELD also exists on Armv7, but the intrinsic is broken
  // for Armv7 clang, hence we restrict this path to A64.
  __yield();
#elif HWY_ARCH_ARM && HWY_COMPILER_GCC  // includes clang
  __asm__ volatile("yield" ::: "memory");
#elif HWY_ARCH_PPC && HWY_COMPILER_GCC  // includes clang
  __asm__ volatile("or 27,27,27" ::: "memory");
#endif
}
} // namespace hwy
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_