1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_TIMER_H_
#define HIGHWAY_HWY_TIMER_H_
// Platform-specific timer functions. Provides Now() and functions for
// interpreting and converting Ticks.
#include <stdint.h>
#include <time.h> // clock_gettime
#include "hwy/base.h"
#if defined(_WIN32) || defined(_WIN64)
#ifndef NOMINMAX
#define NOMINMAX
#endif // NOMINMAX
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif // WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#if defined(__APPLE__)
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif
#if defined(__HAIKU__)
#include <OS.h>
#endif
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
#endif
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
#include <intrin.h>
#endif
namespace hwy {
namespace platform {
// Returns current timestamp [in seconds] relative to an unspecified origin.
// Features: monotonic (no negative elapsed time), steady (unaffected by system
// time changes), high-resolution (on the order of microseconds).
// Uses InvariantTicksPerSecond and the baseline version of timer::Start().
HWY_DLLEXPORT double Now();
// Functions related to `Ticks` below.
// Returns whether it is safe to call timer::Stop without executing an illegal
// instruction; if false, fills cpu100 (a pointer to a 100 character buffer)
// via GetCpuString().
HWY_DLLEXPORT bool HaveTimerStop(char* cpu100);
// Returns tick rate, useful for converting timer::Ticks to seconds. Invariant
// means the tick counter frequency is independent of CPU throttling or sleep.
// This call may be expensive, callers should cache the result.
HWY_DLLEXPORT double InvariantTicksPerSecond();
// Returns ticks elapsed in back to back timer calls, i.e. a function of the
// timer resolution (minimum measurable difference) and overhead.
// This call is expensive, callers should cache the result.
HWY_DLLEXPORT uint64_t TimerResolution();
// Returns false if no detailed description is available, otherwise fills
// `cpu100` with up to 100 characters (including \0) identifying the CPU model.
HWY_DLLEXPORT bool GetCpuString(char* cpu100);
} // namespace platform
struct Timestamp {
Timestamp() { t = platform::Now(); }
double t;
};
static inline double SecondsSince(const Timestamp& t0) {
const Timestamp t1;
return t1.t - t0.t;
}
// Low-level Start/Stop functions, previously in timer-inl.h.
namespace timer {
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
// unsigned to guarantee wraparound on overflow.
using Ticks = uint64_t;
// Start/Stop return absolute timestamps and must be placed immediately before
// and after the region to measure. We provide separate Start/Stop functions
// because they use different fences.
//
// Background: RDTSC is not 'serializing'; earlier instructions may complete
// after it, and/or later instructions may complete before it. 'Fences' ensure
// regions' elapsed times are independent of such reordering. The only
// documented unprivileged serializing instruction is CPUID, which acts as a
// full fence (no reordering across it in either direction). Unfortunately
// the latency of CPUID varies wildly (perhaps made worse by not initializing
// its EAX input). Because it cannot reliably be deducted from the region's
// elapsed time, it must not be included in the region to measure (i.e.
// between the two RDTSC).
//
// The newer RDTSCP is sometimes described as serializing, but it actually
// only serves as a half-fence with release semantics. Although all
// instructions in the region will complete before the final timestamp is
// captured, subsequent instructions may leak into the region and increase the
// elapsed time. Inserting another fence after the final `RDTSCP` would prevent
// such reordering without affecting the measured region.
//
// Fortunately, such a fence exists. The LFENCE instruction is only documented
// to delay later loads until earlier loads are visible. However, Intel's
// reference manual says it acts as a full fence (waiting until all earlier
// instructions have completed, and delaying later instructions until it
// completes). AMD assigns the same behavior to MFENCE.
//
// We need a fence before the initial RDTSC to prevent earlier instructions
// from leaking into the region, and arguably another after RDTSC to avoid
// region instructions from completing before the timestamp is recorded.
// When surrounded by fences, the additional `RDTSCP` half-fence provides no
// benefit, so the initial timestamp can be recorded via RDTSC, which has
// lower overhead than `RDTSCP` because it does not read TSC_AUX. In summary,
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
//
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
// However, Stop+Stop includes an LFENCE in the region measurements, which
// adds a delay dependent on earlier loads. The combination of Start+Stop
// is faster than Start+Start and more consistent than Stop+Stop because
// the first LFENCE already delayed subsequent loads before the measured
// region. This combination seems not to have been considered in prior work:
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
//
// Note: performance counters can measure 'exact' instructions-retired or
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
// requires fences. Unfortunately, it is not accessible on all OSes and we
// prefer to avoid kernel-mode drivers. Performance counters are also affected
// by several under/over-count errata, so we use the TSC instead.
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond.
static HWY_INLINE Ticks Start() {
Ticks t;
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
t = __rdtsc();
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
#elif HWY_ARCH_X86_64
asm volatile(
"lfence\n\t"
"rdtsc\n\t"
"shl $32, %%rdx\n\t"
"or %%rdx, %0\n\t"
"lfence"
: "=a"(t)
:
// "memory" avoids reordering. rdx = TSC >> 32.
// "cc" = flags modified by SHL.
: "rdx", "memory", "cc");
#elif HWY_ARCH_RISCV
asm volatile("fence; rdtime %0" : "=r"(t));
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER counter;
(void)QueryPerformanceCounter(&counter);
t = counter.QuadPart;
#elif defined(__APPLE__)
t = mach_absolute_time();
#elif defined(__HAIKU__)
t = system_time_nsecs(); // since boot
#else // POSIX
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
#endif
return t;
}
// WARNING: on x86, caller must check `HaveTimerStop()` before using this!
static HWY_INLINE Ticks Stop() {
uint64_t t;
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
_ReadWriteBarrier();
unsigned aux;
t = __rdtscp(&aux);
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
#elif HWY_ARCH_X86_64
// Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
asm volatile(
"rdtscp\n\t"
"shl $32, %%rdx\n\t"
"or %%rdx, %0\n\t"
"lfence"
: "=a"(t)
:
// "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
// "cc" = flags modified by SHL.
: "rcx", "rdx", "memory", "cc");
#else
t = Start();
#endif
return t;
}
} // namespace timer
} // namespace hwy
#endif // HIGHWAY_HWY_TIMER_H_