#include "hwy/perf_counters.h"
#include "hwy/detect_compiler_arch.h"
#if HWY_OS_LINUX || HWY_IDE
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/utsname.h>
#include <unistd.h>
#include <string>
#include <vector>
#include "hwy/base.h"
#include "hwy/bit_set.h"
#include "hwy/timer.h"
#endif
namespace hwy {
namespace platform {
#if HWY_OS_LINUX || HWY_IDE
namespace {
// Returns whether the kernel exposes the perf_event interface, detected via
// the presence of its `perf_event_paranoid` sysctl file.
bool PerfCountersSupported() {
  struct stat unused_info;
  const int status =
      stat("/proc/sys/kernel/perf_event_paranoid", &unused_info);
  return status == 0;
}
// Returns the PERF_TYPE_HARDWARE event id to use for the cycle counter.
//
// The result is PERF_COUNT_HW_CPU_CYCLES only if all of the following hold:
// the OS reports itself as "Linux", its release parses as a version below 6.9,
// and the CPU brand string starts with "AMD EPYC". In every other case,
// including any failure to query this information, the result is
// PERF_COUNT_HW_REF_CPU_CYCLES.
uint64_t RefCyclesOrCycles() {
  const uint64_t preferred = PERF_COUNT_HW_REF_CPU_CYCLES;
  const uint64_t fallback = PERF_COUNT_HW_CPU_CYCLES;

  // Kernel name and version.
  utsname sys_info;
  if (uname(&sys_info) != 0) return preferred;
  if (strcmp(sys_info.sysname, "Linux") != 0) return preferred;

  int major;
  int minor;
  const int num_parsed = sscanf(sys_info.release, "%d.%d", &major, &minor);
  if (num_parsed != 2) return preferred;

  // Kernels from 6.9 onwards always use the reference-cycle event.
  const bool is_6_9_or_newer = major > 6 || (major == 6 && minor >= 9);
  if (is_6_9_or_newer) return preferred;

  // Older kernel: only "AMD EPYC" CPUs switch to the plain cycle event.
  char cpu100[100];
  if (!GetCpuString(cpu100)) return preferred;
  const bool is_epyc = strncmp(cpu100, "AMD EPYC", 8) == 0;
  return is_epyc ? fallback : preferred;
}
// Describes one perf event to open and the `PerfCounters::Counter` it backs.
// Aggregate; the field order is relied upon by brace-initializers.
struct CounterConfig {
  uint64_t config;          // copied to `perf_event_attr::config`
  uint32_t type;            // copied to `perf_event_attr::type`
  PerfCounters::Counter c;  // which counter this event provides
};
std::vector<CounterConfig> AllCounterConfigs() {
constexpr uint32_t kHW = PERF_TYPE_HARDWARE;
constexpr uint32_t kSW = PERF_TYPE_SOFTWARE;
constexpr uint32_t kC = PERF_TYPE_HW_CACHE;
constexpr uint64_t kL3 = PERF_COUNT_HW_CACHE_LL;
constexpr uint64_t kLoad = uint64_t{PERF_COUNT_HW_CACHE_OP_READ} << 8;
constexpr uint64_t kStore = uint64_t{PERF_COUNT_HW_CACHE_OP_WRITE} << 8;
constexpr uint64_t kAcc = uint64_t{PERF_COUNT_HW_CACHE_RESULT_ACCESS} << 16;
return {{RefCyclesOrCycles(), kHW, PerfCounters::kRefCycles},
{PERF_COUNT_HW_INSTRUCTIONS, kHW, PerfCounters::kInstructions},
{PERF_COUNT_SW_PAGE_FAULTS, kSW, PerfCounters::kPageFaults},
{kL3 | kLoad | kAcc, kC, PerfCounters::kL3Loads},
{kL3 | kStore | kAcc, kC, PerfCounters::kL3Stores},
{PERF_COUNT_HW_BRANCH_INSTRUCTIONS, kHW, PerfCounters::kBranches},
{PERF_COUNT_HW_BRANCH_MISSES, kHW, PerfCounters::kBranchMispredicts},
{PERF_COUNT_HW_BUS_CYCLES, kHW, PerfCounters::kBusCycles},
{PERF_COUNT_SW_CPU_MIGRATIONS, kSW, PerfCounters::kMigrations},
{PERF_COUNT_HW_CACHE_REFERENCES, kHW, PerfCounters::kCacheRefs},
{PERF_COUNT_HW_CACHE_MISSES, kHW, PerfCounters::kCacheMisses}};
}
// Returns a mutable reference to the table entry for counter `c`. The table
// is zero-initialized, has 64 entries and is indexed by the numeric value of
// `c`; no bounds check is performed.
size_t& PackedIdx(PerfCounters::Counter c) {
  static size_t index_for_counter[64];
  const size_t slot = static_cast<size_t>(c);
  return index_for_counter[slot];
}
// Holds the perf_event file descriptors of all counters that could be opened.
// Counters are organized into event groups; each group is read with a single
// `read()` on its leader's file descriptor.
class PMU {
  // Returns the attributes used to open the event described by `cc`.
  static perf_event_attr MakeAttr(const CounterConfig& cc) {
    perf_event_attr attr = {};
    attr.type = cc.type;
    attr.size = sizeof(attr);
    attr.config = cc.config;
    // The enabled/running times are used by `ReadAndExtrapolate` to scale the
    // counts. GROUP makes a read of the leader return the values of all
    // events in its group, see `Buf`.
    attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                       PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_GROUP;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;
    return attr;
  }

  // Opens the event `cc` for the calling process on any CPU. `leader_fd` is
  // the group leader to attach to, or -1 to create a new group. Returns the
  // new file descriptor, or -1 on failure (errno is set by the syscall).
  static int SysPerfEventOpen(const CounterConfig& cc, int leader_fd) {
    perf_event_attr attr = MakeAttr(cc);
    const int pid = 0;   // calling process/thread
    const int cpu = -1;  // any CPU
    // Retry a bounded number of times if interrupted by a signal.
    for (int retry = 0; retry < 10; ++retry) {
      const int flags = 0;
      const int fd = static_cast<int>(
          syscall(__NR_perf_event_open, &attr, pid, cpu, leader_fd, flags));
      if (!(fd == -1 && errno == EINTR)) return fd;
    }
    HWY_WARN("perf_event_open retries were insufficient.");
    return -1;
  }

  // Reads exactly `size` bytes from `fd` into `to`, retrying after EINTR and
  // short reads (at most 10 `read()` calls in total). Returns false and
  // prints a warning on failure.
  static bool ReadBytes(int fd, ssize_t size, void* to) {
    uint8_t* bytes = reinterpret_cast<uint8_t*>(to);
    ssize_t pos = 0;
    for (int retry = 0; retry < 10; ++retry) {
      const ssize_t bytes_read =
          read(fd, bytes + pos, static_cast<size_t>(size - pos));
      // `read()` only sets errno when it returns -1, so errno is consulted
      // only in that case.
      if (HWY_UNLIKELY(bytes_read < 0)) {
        if (errno == EINTR) continue;
        HWY_WARN("perf read() failed, errno %d.", errno);
        return false;
      }
      // Zero bytes means no more data will arrive; errno would be stale here,
      // so stop and report the shortfall below.
      if (HWY_UNLIKELY(bytes_read == 0)) break;
      pos += bytes_read;
      HWY_ASSERT(pos <= size);
      if (HWY_LIKELY(pos == size)) return true;
    }
    HWY_WARN("perf read() wanted %d bytes, got %d.", static_cast<int>(size),
             static_cast<int>(pos));
    return false;
  }

  // Upper bound on the number of events per group, which also sizes `Buf`.
  static constexpr size_t kMaxEventsPerGroup = PerfCounters::kCapacity;

  // Layout of the data returned by reading a group leader with the
  // `read_format` from `MakeAttr`. Only the first `num_events` entries of
  // `values` are filled.
#pragma pack(push, 1)
  struct Buf {
    uint64_t num_events;
    uint64_t time_enabled;
    uint64_t time_running;
    uint64_t values[kMaxEventsPerGroup];
  };
#pragma pack(pop)

  // Reads the `num_events` counters of the group led by `fd`, writes
  // `time_enabled / time_running` to `extrapolate` and stores each counter
  // multiplied by that factor in `values`. Returns false, without writing any
  // output, if reading fails or the group has not been running at all.
  static bool ReadAndExtrapolate(int fd, size_t num_events, double& extrapolate,
                                 double* HWY_RESTRICT values) {
    Buf buf;
    // The three `uint64_t` fields preceding `Buf::values`.
    constexpr size_t kHeaderBytes = 3 * sizeof(uint64_t);
    const ssize_t want_bytes =
        static_cast<ssize_t>(kHeaderBytes + num_events * sizeof(uint64_t));
    if (HWY_UNLIKELY(!ReadBytes(fd, want_bytes, &buf))) return false;

    HWY_DASSERT(num_events == buf.num_events);
    HWY_DASSERT(buf.time_running <= buf.time_enabled);
    // Avoid division by zero.
    if (HWY_UNLIKELY(buf.time_running == 0)) return false;

    extrapolate = static_cast<double>(buf.time_enabled) /
                  static_cast<double>(buf.time_running);
    for (size_t i = 0; i < buf.num_events; ++i) {
      values[i] = static_cast<double>(buf.values[i]) * extrapolate;
    }
    return true;
  }

 public:
  // Opens all counters from `AllCounterConfigs` and assigns them to groups,
  // then stops and resets them. Does nothing and returns true if a previous
  // call already opened at least one counter. Returns false if perf counters
  // are not supported at all.
  bool Init() {
    if (HWY_UNLIKELY(!fds_.empty())) return true;
    if (HWY_UNLIKELY(!PerfCountersSupported())) {
      HWY_WARN(
          "This Linux does not support perf counters. The program will "
          "continue, but counters will return zero.");
      return false;
    }

    groups_.push_back(Group());
    fds_.reserve(PerfCounters::kCapacity);

    for (const CounterConfig& config : AllCounterConfigs()) {
      // `Buf` cannot hold more events, hence start a new group.
      if (HWY_UNLIKELY(groups_.back().num_events == kMaxEventsPerGroup)) {
        groups_.push_back(Group());
      }

      int fd = SysPerfEventOpen(config, groups_.back().leader_fd);
      // If the event cannot join the current group, retry as the leader of a
      // new group. The new group is only recorded if that succeeded and the
      // current group is in use.
      if (HWY_UNLIKELY(fd < 0)) {
        fd = SysPerfEventOpen(config, -1);
        if (fd >= 0 && groups_.back().num_events != 0) {
          groups_.push_back(Group());
        }
      }

      if (HWY_UNLIKELY(fd < 0)) {
        HWY_WARN("perf_event_open %d errno %d for counter %s.", fd, errno,
                 PerfCounters::Name(config.c));
      } else {
        // The first event of a group becomes its leader.
        if (groups_.back().leader_fd == -1) {
          groups_.back().leader_fd = fd;
          if (HWY_UNLIKELY(config.type == PERF_TYPE_SOFTWARE)) {
            HWY_WARN("SW event %s should not be leader.",
                     PerfCounters::Name(config.c));
          }
        }

        // Values are stored in the order in which events were opened.
        PackedIdx(config.c) = fds_.size();
        groups_.back().num_events += 1;
        valid_.Set(static_cast<size_t>(config.c));
        fds_.push_back(fd);
      }
    }

    // Nothing could be opened: remove the initial, still empty group.
    if (HWY_UNLIKELY(fds_.empty())) {
      HWY_ASSERT(groups_.size() == 1);
      HWY_ASSERT(groups_.back().num_events == 0);
      HWY_ASSERT(groups_.back().leader_fd == -1);
      groups_.clear();
    }

    // Verify invariants: every group has a leader and is non-empty, and the
    // group sizes add up to the number of opened/valid counters.
    size_t num_valid = 0;
    for (const Group& group : groups_) {
      num_valid += group.num_events;
      HWY_ASSERT(group.leader_fd >= 0);
      HWY_ASSERT(0 != group.num_events &&
                 group.num_events <= kMaxEventsPerGroup);
    }
    HWY_ASSERT(num_valid == fds_.size());
    HWY_ASSERT(num_valid == valid_.Count());
    HWY_ASSERT(num_valid <= PerfCounters::kCapacity);

    if (num_valid) {
      StopAllAndReset();
      return true;
    } else {
      HWY_WARN("No valid counters found.");
      // NOTE(review): this also returns true although no counter is usable;
      // confirm against the contract documented for `PerfCounters::Init`.
      return true;
    }
  }

  // Enables the perf events of the calling task. Returns false, without
  // enabling anything, if no counter was opened.
  bool StartAll() {
    if (HWY_UNLIKELY(fds_.empty())) return false;
    HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_ENABLE) == 0);
    return true;
  }

  // Disables the perf events of the calling task and resets every opened
  // counter.
  void StopAllAndReset() {
    HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_DISABLE) == 0);
    for (int fd : fds_) {
      HWY_ASSERT(ioctl(fd, PERF_EVENT_IOC_RESET, 0) == 0);
    }
  }

  // Reads all groups. On success, returns true, sets `valid` to the set of
  // opened counters, `max_extrapolate` to the largest extrapolation factor of
  // any group (at least 1.0), and writes one extrapolated value per opened
  // counter to `values`, in the order in which they were opened. Returns false
  // if no counter is valid or any group could not be read; the outputs may
  // then be partially written.
  // NOTE(review): presumably `values` must have room for all opened counters;
  // verify against callers.
  bool Read(BitSet64& valid, double& max_extrapolate, double* values) {
    if (HWY_UNLIKELY(!valid_.Any())) return false;

    max_extrapolate = 1.0;
    double* pos = values;
    for (const Group& group : groups_) {
      double extrapolate;
      if (HWY_UNLIKELY(!ReadAndExtrapolate(group.leader_fd, group.num_events,
                                           extrapolate, pos))) {
        return false;
      }
      max_extrapolate = HWY_MAX(max_extrapolate, extrapolate);
      pos += group.num_events;
    }

    valid = valid_;
    HWY_DASSERT(pos == values + valid.Count());
    return true;
  }

 private:
  std::vector<int> fds_;  // one per opened counter, in opening order
  BitSet64 valid_;        // indexed by `PerfCounters::Counter`

  struct Group {
    size_t num_events = 0;
    int leader_fd = -1;  // -1 until the first event has been added
  };
  std::vector<Group> groups_;
};
// Returns the single `PMU` instance, which is created on the first call.
PMU& GetPMU() {
  static PMU instance;
  return instance;
}
}
// Forwards to `PMU::Init` of the shared PMU instance.
HWY_DLLEXPORT bool PerfCounters::Init() {
  PMU& pmu = GetPMU();
  return pmu.Init();
}
// Forwards to `PMU::StartAll` of the shared PMU instance.
HWY_DLLEXPORT bool PerfCounters::StartAll() {
  PMU& pmu = GetPMU();
  return pmu.StartAll();
}
// Forwards to `PMU::StopAllAndReset` of the shared PMU instance.
HWY_DLLEXPORT void PerfCounters::StopAllAndReset() {
  PMU& pmu = GetPMU();
  pmu.StopAllAndReset();
}
// Reads the current counter values from the shared PMU instance. If that
// fails, no counter is marked valid, `max_extrapolate_` is 0.0 and all values
// are zero.
HWY_DLLEXPORT PerfCounters::PerfCounters() {
  const bool ok = GetPMU().Read(valid_, max_extrapolate_, values_);
  if (HWY_LIKELY(ok)) return;

  // `Read` may have partially written its outputs, hence overwrite them all.
  valid_ = BitSet64();
  max_extrapolate_ = 0.0;
  hwy::ZeroBytes(values_, sizeof(values_));
}
// Returns the index recorded for `c` in the `PackedIdx` table.
HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter c) {
  const size_t packed = PackedIdx(c);
  return packed;
}
#else
// Stub for platforms other than Linux: always reports failure.
HWY_DLLEXPORT bool PerfCounters::Init() {
  return false;
}
// Stub for platforms other than Linux: always reports failure.
HWY_DLLEXPORT bool PerfCounters::StartAll() {
  return false;
}
// Stub for platforms other than Linux: does nothing.
HWY_DLLEXPORT void PerfCounters::StopAllAndReset() {
}
// Stub for platforms other than Linux: `valid_` is default-constructed,
// `max_extrapolate_` is 1.0 and all values are zero.
HWY_DLLEXPORT PerfCounters::PerfCounters()
    : max_extrapolate_(1.0),
      values_{0.0} {
}
// Stub for platforms other than Linux: returns 0 for every counter.
HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter) {
  return 0;
}
#endif
} }