#include "hwy/contrib/thread_pool/topology.h"
#include <ctype.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <array>
#include <map>
#include <string>
#include <vector>
#include "hwy/base.h"
#if HWY_OS_APPLE
#include <sys/sysctl.h>
#include "hwy/aligned_allocator.h"
#endif
#if HWY_OS_WIN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601
#endif
#include <windows.h>
#endif
#if HWY_OS_LINUX || HWY_OS_FREEBSD
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#endif
#if HWY_OS_FREEBSD
#include <sys/param.h>
#include <sys/cpuset.h>
#endif
#if HWY_ARCH_WASM
#include <emscripten/threading.h>
#endif
namespace hwy {
// Returns whether the current environment supports threads. Only WASM may
// lack support; all other platforms unconditionally report true.
HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() {
#if HWY_ARCH_WASM
  return emscripten_has_threading_support() != 0;
#else
  return true;
#endif
}
namespace {
// Divides `whole` by `part`, aborting unless `part` is an exact factor.
// `part` must be nonzero.
HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) {
  HWY_ASSERT(part != 0);
  const size_t quotient = whole / part;
  const size_t rounded = quotient * part;
  if (rounded != whole) {
    HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, quotient, part,
              rounded);
  }
  return quotient;
}
#if HWY_OS_WIN
using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;
// Calls `func` with each SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX entry of
// relationship `rel` (or all entries if `rel == RelationAll`). Returns false
// if the query fails or `rel` is unsupported by this OS version.
template <class Func>
bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) {
  // First call with null buffer queries the required size; it is expected to
  // fail with ERROR_INSUFFICIENT_BUFFER (asserted below).
  DWORD buf_bytes = 0;
  HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes));
  if (HWY_UNLIKELY(buf_bytes == 0 && GetLastError() == ERROR_GEN_FAILURE)) {
    // NumaNodeEx/ProcessorDie are deliberately tolerated here — presumably
    // unsupported on older Windows versions; only warn for other types.
    if (rel != RelationNumaNodeEx && rel != RelationProcessorDie) {
      HWY_WARN("Unexpected err %lx for GLPI relationship %d\n", GetLastError(),
               static_cast<int>(rel));
    }
    return false;
  }
  HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER);
  uint8_t* buf = static_cast<uint8_t*>(malloc(buf_bytes));
  HWY_ASSERT(buf);
  SLPI* info = reinterpret_cast<SLPI*>(buf);
  if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) {
    free(buf);
    return false;
  }
  // Entries are variable-length; advance by each entry's own Size field.
  uint8_t* pos = buf;
  while (pos < buf + buf_bytes) {
    info = reinterpret_cast<SLPI*>(pos);
    HWY_ASSERT(rel == RelationAll || info->Relationship == rel);
    func(*info);
    pos += info->Size;
  }
  // The loop can only exit with pos == end unless a Size field overshot.
  if (pos != buf + buf_bytes) {
    HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n",
             pos, buf + buf_bytes, buf_bytes, sizeof(SLPI));
  }
  free(buf);
  return true;
}
// Returns the total number of set bits across `num_groups` processor-group
// affinity masks.
size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) {
  size_t total_bits = 0;
  for (size_t i = 0; i < num_groups; ++i) {
    // Byte-copy into size_t to avoid type-punning the KAFFINITY mask.
    size_t bits = 0;
    hwy::CopyBytes<sizeof(bits)>(&affinity[i].Mask, &bits);
    total_bits += hwy::PopCount(bits);
  }
  return total_bits;
}
// Calls `func(lp, lps)` for each set bit (logical processor) in the group
// affinity masks. `line` identifies the caller in warning messages.
// NOTE(review): `lp` is the bit index within its group only; this presumably
// assumes a single processor group — confirm behavior on >64-LP systems.
template <class Func>
void ForeachBit(size_t num_groups, const GROUP_AFFINITY* affinity,
                std::vector<Topology::LP>& lps, int line, const Func& func) {
  for (size_t group = 0; group < num_groups; ++group) {
    // Byte-copy into size_t to avoid type-punning the KAFFINITY mask.
    size_t bits = 0;
    hwy::CopyBytes<sizeof(bits)>(&affinity[group].Mask, &bits);
    while (bits != 0) {
      size_t lp = Num0BitsBelowLS1Bit_Nonzero64(bits);
      bits &= bits - 1;  // Clear the lowest set bit.
      // Defensive: the OS may report an LP beyond our (possibly clamped)
      // lps vector; clamp rather than write out of bounds.
      if (HWY_UNLIKELY(lp >= lps.size())) {
        Warn(__FILE__, line, "Clamping lp %zu to lps.size() %zu, groups %zu\n",
             lp, lps.size(), num_groups);
        lp = lps.size() - 1;
      }
      func(lp, lps);
    }
  }
}
#elif HWY_OS_APPLE
// Reads the integer sysctl `name`, divides by `div` (must be an exact factor,
// else aborts via DivByFactor) and stores the result in `out`. On failure,
// sets `err` to the sysctlbyname return value and returns false.
template <typename T>
bool Sysctl(const char* name, size_t div, int& err, T* out) {
  // Zero-initialized so sysctls that write fewer than sizeof(val) bytes
  // leave the upper bytes zero (assumes little-endian — true on Apple).
  size_t val = 0;
  size_t size = sizeof(val);
  const int ret = sysctlbyname(name, &val, &size, nullptr, 0);
  if (HWY_UNLIKELY(ret != 0)) {
    err = ret;
    return false;
  }
  *out = static_cast<T>(DivByFactor(val, div));
  return true;
}
#endif
}
// Returns the number of logical processors ("LPs") reported by the OS,
// clamped to [1, kMaxLogicalProcessors]. Falls back to 1 (with a warning)
// if detection is unavailable or fails.
HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() {
  size_t total_lps = 0;
#if HWY_ARCH_WASM
  const int num_cores = emscripten_num_logical_cores();
  if (num_cores > 0) total_lps = static_cast<size_t>(num_cores);
#elif HWY_OS_WIN
  // Sum the LPs of every core reported by the OS.
  (void)ForEachSLPI(RelationProcessorCore, [&total_lps](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    total_lps += NumBits(p.GroupCount, p.GroupMask);
  });
#elif HWY_OS_LINUX
  const long ret = sysconf(_SC_NPROCESSORS_ONLN);
  if (ret < 0) {
    // Fixed: the message previously named _SC_NPROCESSORS_CONF, but the
    // query above is _SC_NPROCESSORS_ONLN.
    HWY_WARN("Unexpected _SC_NPROCESSORS_ONLN = %d\n", static_cast<int>(ret));
  } else {
    total_lps = static_cast<size_t>(ret);
  }
#elif HWY_OS_APPLE
  int err;
  // Only perflevel0 (the first/performance core type) is counted here —
  // presumably deliberate to keep the set homogeneous; confirm intent.
  if (!Sysctl("hw.perflevel0.logicalcpu", 1, err, &total_lps)) {
    total_lps = 0;
  }
#endif
  if (HWY_UNLIKELY(total_lps == 0)) {
    HWY_WARN(
        "Unknown TotalLogicalProcessors, assuming 1. "
        "HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n"
        "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n",
        HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86,
        HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X);
    return 1;
  }
  // LogicalProcessorSet has a fixed capacity; clamp to it.
  if (HWY_UNLIKELY(total_lps > kMaxLogicalProcessors)) {
    HWY_WARN("OS reports %zu processors but clamping to %zu\n", total_lps,
             kMaxLogicalProcessors);
    total_lps = kMaxLogicalProcessors;
  }
  return total_lps;
}
#if HWY_OS_LINUX || HWY_OS_FREEBSD
namespace {
#if HWY_OS_LINUX
using CpuSet = cpu_set_t;
#else
using CpuSet = cpuset_t;
#endif
// Fills `set` with the calling thread's CPU affinity. Callers treat nonzero
// as failure.
int GetAffinity(CpuSet* set) {
#if defined(__ANDROID__) && __ANDROID_API__ < 12
  // Old Android lacks the libc wrapper; use the raw syscall.
  // NOTE(review): the raw syscall returns the copied mask size (>0) on
  // success, which callers would treat as failure — confirm on old Android.
  return syscall(__NR_sched_getaffinity, 0, sizeof(CpuSet), set);
#elif HWY_OS_FREEBSD
  // id = -1 means the current thread.
  return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet),
                            set);
#else
  // pid = 0 means the calling thread.
  return sched_getaffinity(0, sizeof(CpuSet), set);
#endif
}
// Applies `set` as the calling thread's CPU affinity. Callers treat nonzero
// as failure.
int SetAffinity(CpuSet* set) {
#if defined(__ANDROID__) && __ANDROID_API__ < 12
  // Old Android lacks the libc wrapper; use the raw syscall.
  return syscall(__NR_sched_setaffinity, 0, sizeof(CpuSet), set);
#elif HWY_OS_FREEBSD
  // id = -1 means the current thread.
  return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet),
                            set);
#else
  // pid = 0 means the calling thread.
  return sched_setaffinity(0, sizeof(CpuSet), set);
#endif
}
// Returns whether bit `lp` is set in `set`. Wrapped so we can suppress
// sign-conversion warnings triggered inside the CPU_ISSET macro.
bool IsSet(size_t lp, const CpuSet* set) {
#if HWY_COMPILER_GCC_ACTUAL
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
#endif
  const int is_set = CPU_ISSET(static_cast<int>(lp), set);
#if HWY_COMPILER_GCC_ACTUAL
  HWY_DIAGNOSTICS(pop)
#endif
  return is_set != 0;
}
// Sets bit `lp` in `set`. Wrapped so we can suppress sign-conversion
// warnings triggered inside the CPU_SET macro.
void Set(size_t lp, CpuSet* set) {
#if HWY_COMPILER_GCC_ACTUAL
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
#endif
  CPU_SET(static_cast<int>(lp), set);
#if HWY_COMPILER_GCC_ACTUAL
  HWY_DIAGNOSTICS(pop)
#endif
}
} #endif
// Queries the calling thread's affinity into `lps`. Returns false if the OS
// is unsupported or the query fails.
HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) {
#if HWY_OS_WIN
  // Windows has no direct getter: setting an all-ones mask returns the
  // previous mask, which we then restore. Limited to 64 LPs (one group).
  const HANDLE hThread = GetCurrentThread();
  const DWORD_PTR prev = SetThreadAffinityMask(hThread, ~DWORD_PTR(0));
  if (!prev) return false;
  (void)SetThreadAffinityMask(hThread, prev);
  lps = LogicalProcessorSet();  // Clear any existing bits first.
  lps.SetNonzeroBitsFrom64(prev);
  return true;
#elif HWY_OS_LINUX || HWY_OS_FREEBSD
  CpuSet set;
  CPU_ZERO(&set);
  const int err = GetAffinity(&set);
  if (err != 0) return false;
  // Copy each set bit from the kernel mask into `lps`.
  for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) {
    if (IsSet(lp, &set)) lps.Set(lp);
  }
  return true;
#else
  (void)lps;
  return false;
#endif
}
// Applies `lps` as the calling thread's affinity. Returns false if the OS is
// unsupported or the call fails.
HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) {
#if HWY_OS_WIN
  // Limited to 64 LPs (one processor group) via Get64().
  const HANDLE hThread = GetCurrentThread();
  const DWORD_PTR prev = SetThreadAffinityMask(hThread, lps.Get64());
  return prev != 0;
#elif HWY_OS_LINUX || HWY_OS_FREEBSD
  CpuSet set;
  CPU_ZERO(&set);
  // Translate each bit of `lps` into the kernel mask.
  lps.Foreach([&set](size_t lp) { Set(lp, &set); });
  const int err = SetAffinity(&set);
  if (err != 0) return false;
  return true;
#else
  (void)lps;
  return false;
#endif
}
namespace {
// Per-package counts, used by Topology's constructor to size its vectors.
struct PackageSizes {
  size_t num_clusters;  // Clusters within this package.
  size_t num_cores;     // Cores within this package.
};
#if HWY_OS_LINUX
class File {
public:
explicit File(const char* path) {
for (;;) {
fd_ = open(path, O_RDONLY);
if (fd_ > 0) return; if (errno == EINTR) continue; if (errno == ENOENT) return; HWY_WARN("Unexpected error opening %s: %d\n", path, errno);
return; }
}
~File() {
if (fd_ > 0) {
for (;;) {
const int ret = close(fd_);
if (ret == 0) break; if (errno == EINTR) continue; HWY_WARN("Unexpected error closing file: %d\n", errno);
return; }
}
}
size_t Read(char* buf200) const {
if (fd_ < 0) return 0;
size_t pos = 0;
for (;;) {
const auto bytes_read = read(fd_, buf200 + pos, 200 - pos);
if (bytes_read == 0) { buf200[pos++] = '\0';
return pos;
}
if (bytes_read == -1) {
if (errno == EINTR) continue; HWY_WARN("Unexpected error reading file: %d\n", errno);
return 0;
}
pos += static_cast<size_t>(bytes_read);
HWY_ASSERT(pos <= 200);
}
}
private:
int fd_;
};
size_t ReadSysfs(const char* format, size_t lp, char* buf200) {
char path[200];
const int bytes_written = snprintf(path, sizeof(path), format, lp);
HWY_ASSERT(0 < bytes_written &&
bytes_written < static_cast<int>(sizeof(path) - 1));
const File file(path);
return file.Read(buf200);
}
// Parses an unsigned decimal number (at most 9 digits, so it fits in 32
// bits) from str[pos, end), storing it in `out` and advancing `pos` past the
// digits. Returns false if no digit is present at `pos`.
bool ParseDigits(const char* str, const size_t end, size_t& pos, size_t* out) {
  HWY_ASSERT(pos <= end);
  const size_t begin = pos;      // Remember where parsing started.
  const size_t stop = pos + 9;   // Limit to 9 digits to avoid overflow.
  *out = 0;
  for (; pos < HWY_MIN(end, stop); ++pos) {
    const int c = str[pos];
    if (c < '0' || c > '9') break;
    *out *= 10;
    *out += static_cast<size_t>(c - '0');
  }
  // Fixed: previously compared `pos == 0`, which only detected "no digits"
  // when parsing started at offset 0. Mid-string calls (e.g. after ',' or
  // '-' in ExpandList) would incorrectly report success with *out == 0.
  if (pos == begin) {  // No digits found.
    *out = 0;
    return false;
  }
  return true;
}
// Parses a decimal number optionally followed by a 'K' (KiB) and/or 'M'
// (MiB) suffix, ending at '\0' or '\n'. Aborts on any other trailing
// character. Returns false if no number is present.
bool ParseNumberWithOptionalSuffix(const char* str, size_t len, size_t* out) {
  size_t pos = 0;
  if (!ParseDigits(str, len, pos, out)) return false;
  // Scale by 2^shift if the given suffix character follows.
  const auto apply_suffix = [&](char suffix, int shift) {
    if (str[pos] == suffix) {
      *out <<= shift;
      ++pos;
    }
  };
  apply_suffix('K', 10);
  apply_suffix('M', 20);
  if (str[pos] != '\0' && str[pos] != '\n') {
    HWY_ABORT("Expected [suffix] terminator at %zu %s\n", pos, str);
  }
  return true;
}
// Reads the sysfs file given by `format`/`lp` and parses its contents as a
// number with optional K/M suffix. Returns false if the file is missing,
// empty, or not a number.
bool ReadNumberWithOptionalSuffix(const char* format, size_t lp, size_t* out) {
  char buf200[200];
  const size_t bytes = ReadSysfs(format, lp, buf200);
  return bytes != 0 && ParseNumberWithOptionalSuffix(buf200, bytes, out);
}
// sysfs path templates; %zu is the logical processor (or NUMA node) index.
const char* kPackage =
    "/sys/devices/system/cpu/cpu%zu/topology/physical_package_id";
// index3 (presumably L3) cache id, used as a proxy for the cluster id.
const char* kCluster = "/sys/devices/system/cpu/cpu%zu/cache/index3/id";
const char* kCore = "/sys/devices/system/cpu/cpu%zu/topology/core_id";
const char* kL2Size = "/sys/devices/system/cpu/cpu%zu/cache/index2/size";
const char* kL3Size = "/sys/devices/system/cpu/cpu%zu/cache/index3/size";
const char* kNode = "/sys/devices/system/node/node%zu/cpulist";
// Remaps opaque sysfs identifiers (which may be large or sparse) to dense
// indices assigned in order of first appearance.
class Remapper {
 public:
  // Reads the sysfs value selected by `format`/`lp` and stores its dense
  // index in `out_index`. Returns false if the file cannot be read/parsed.
  template <typename T>
  bool operator()(const char* format, size_t lp, T* HWY_RESTRICT out_index) {
    size_t opaque;
    if (!ReadNumberWithOptionalSuffix(format, lp, &opaque)) return false;
    // insert() is a no-op if the key exists; ib.second is 1 only when the
    // identifier is new, which grows the dense count.
    const auto ib = indices_.insert({opaque, num_});
    num_ += ib.second;
    const size_t index = ib.first->second;
    HWY_ASSERT(index < num_);
    HWY_ASSERT(index < hwy::LimitsMax<T>());
    *out_index = static_cast<T>(index);
    return true;
  }

  // Number of distinct identifiers seen so far.
  size_t Num() const { return num_; }

 private:
  std::map<size_t, size_t> indices_;  // opaque id -> dense index
  size_t num_ = 0;
};
// Scratch state while detecting the topology of a single package.
struct PerPackage {
  Remapper clusters;
  Remapper cores;
  // Count of LPs seen so far per core id; used to assign SMT indices.
  uint8_t smt_per_core[kMaxLogicalProcessors] = {0};
};
// Linux: reads per-LP package/cluster/core ids from sysfs, remaps them to
// dense indices stored in `lps`, and returns per-package sizes. Returns an
// empty vector on failure.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  std::vector<PackageSizes> empty;
  Remapper packages;
  // First pass: assign a dense package index to every LP.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    if (!packages(kPackage, lp, &lps[lp].package)) {
      HWY_WARN("Failed to read sysfs package for LP %zu\n", lp);
      return empty;
    }
  }
  std::vector<PerPackage> per_package(packages.Num());
  HWY_ASSERT(!per_package.empty());
  // Second pass: per-package cluster/core indices and SMT numbering.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    PerPackage& pp = per_package[lps[lp].package];
    // Missing cluster id (no index3 cache) is tolerated: one cluster.
    if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) {
      lps[lp].cluster = 0;
    }
    if (!pp.cores(kCore, lp, &lps[lp].core)) {
      HWY_WARN("Failed to read sysfs core for LP %zu\n", lp);
      return empty;
    }
    HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors);
    // SMT index = how many LPs of this core were already seen.
    lps[lp].smt = pp.smt_per_core[lps[lp].core]++;
    HWY_ASSERT(lps[lp].smt < 16);
  }
  std::vector<PackageSizes> package_sizes(per_package.size());
  for (size_t p = 0; p < package_sizes.size(); ++p) {
    // At least one cluster even if no cluster ids were readable.
    package_sizes[p].num_clusters = HWY_MAX(1, per_package[p].clusters.Num());
    package_sizes[p].num_cores = per_package[p].cores.Num();
    HWY_ASSERT(package_sizes[p].num_cores != 0);
  }
  return package_sizes;
}
// Expands a sysfs cpulist such as "0-3,5,7-8" into the individual LP
// numbers, each asserted to be <= max_lp. `list` must be null-terminated
// (guaranteed by File::Read). Returns empty for a blank/whitespace list.
std::vector<size_t> ExpandList(const char* list, size_t list_end,
                               size_t max_lp) {
  std::vector<size_t> expanded;
  constexpr size_t kNotFound = ~size_t{0};
  size_t pos = 0;
  // An empty list is just "\n\0"; bail out before parsing.
  // NOTE(review): isspace with a plain char is UB for negative values —
  // harmless for sysfs ASCII, but a cast to unsigned char would be safer.
  if (isspace(list[0]) && list_end <= 2) return expanded;
  // Returns the absolute position of the next `c` at/after `pos`, or
  // kNotFound. Relies on the null terminator to stop strchr.
  const auto find = [list, list_end, &pos](char c) -> size_t {
    const char* found_ptr = strchr(list + pos, c);
    if (found_ptr == nullptr) return kNotFound;
    const size_t found_pos = static_cast<size_t>(found_ptr - list);
    HWY_ASSERT(found_pos < list_end && list[found_pos] == c);
    return found_pos;
  };
  // Parses the LP number in [pos, min(end, list_end)) and advances pos.
  const auto parse_lp = [list, list_end, &pos, max_lp](size_t end) -> size_t {
    end = HWY_MIN(end, list_end);
    size_t lp;
    HWY_ASSERT(ParseDigits(list, end, pos, &lp));
    // RISC-V workaround: clamp instead of asserting out-of-range LPs.
    HWY_IF_CONSTEXPR(HWY_ARCH_RISCV) {
      lp = HWY_MIN(lp, max_lp);
    }
    HWY_ASSERT(lp <= max_lp);
    HWY_ASSERT(pos <= end);
    return lp;
  };
  for (;;) {
    // Parse up to the next '-' or ',' — whichever comes first.
    const size_t lp_range_first = parse_lp(HWY_MIN(find('-'), find(',')));
    if (list[pos] == '-') {  // Range: expand first..last inclusive.
      ++pos;
      const size_t lp_range_last = parse_lp(find(','));
      expanded.reserve(expanded.size() + lp_range_last - lp_range_first + 1);
      for (size_t lp = lp_range_first; lp <= lp_range_last; ++lp) {
        expanded.push_back(lp);
      }
    } else {  // Single LP.
      expanded.push_back(lp_range_first);
    }
    if (pos == list_end || list[pos] == '\0' || list[pos] == '\n') {
      break;  // End of list.
    }
    if (list[pos] == ',') {
      ++pos;
      continue;  // More entries follow.
    }
    HWY_ABORT("Unexpected character at %zu in %s\n", pos, list);
  }
  return expanded;
}
// Linux: reads each NUMA node's cpulist from sysfs and records the node
// index for every LP it contains. Stops at the first missing node file.
void SetNodes(std::vector<Topology::LP>& lps) {
  size_t node = 0;
  for (;;) {
    char buf200[200];
    const size_t bytes_read = ReadSysfs(kNode, node, buf200);
    if (bytes_read == 0) break;  // No more nodes.
    for (const size_t lp : ExpandList(buf200, bytes_read, lps.size() - 1)) {
      lps[lp].node = static_cast<uint8_t>(node);
    }
    ++node;
  }
}
// Linux: for each cluster, looks up the L2 (private) and L3 (shared) cache
// sizes of its first LP via sysfs and stores them in KiB. Missing entries
// leave the fields unchanged.
void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
  for (Topology::Package& p : packages) {
    for (Topology::Cluster& c : p.clusters) {
      const size_t lp = c.lps.First();
      size_t bytes;
      if (ReadNumberWithOptionalSuffix(kL2Size, lp, &bytes)) {
        c.private_kib = bytes >> 10;
      }
      if (ReadNumberWithOptionalSuffix(kL3Size, lp, &bytes)) {
        c.shared_kib = bytes >> 10;
      }
    }
  }
}
#elif HWY_OS_WIN
// Windows: assigns core and SMT indices to each LP and returns the maximum
// number of LPs per core (SMT width) across all cores.
size_t MaxLpsPerCore(std::vector<Topology::LP>& lps) {
  size_t max_lps_per_core = 0;
  size_t core_idx = 0;
  (void)ForEachSLPI(RelationProcessorCore, [&max_lps_per_core, &core_idx,
                                            &lps](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    const size_t lps_per_core = NumBits(p.GroupCount, p.GroupMask);
    max_lps_per_core = HWY_MAX(max_lps_per_core, lps_per_core);
    // Number LPs within this core 0..lps_per_core-1 as SMT indices.
    size_t smt = 0;
    ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
               [core_idx, &smt](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].core = static_cast<uint16_t>(core_idx);
                 lps[lp].smt = static_cast<uint8_t>(smt++);
               });
    ++core_idx;
  });
  HWY_ASSERT(max_lps_per_core != 0);
  return max_lps_per_core;
}
// Windows: assigns cluster indices to each LP and returns the maximum number
// of cores per cluster. Clusters are defined by processor dies if the OS
// reports them, otherwise by shared L3 caches.
size_t MaxCoresPerCluster(const size_t max_lps_per_core,
                          std::vector<Topology::LP>& lps) {
  size_t max_cores_per_cluster = 0;
  size_t cluster_idx = 0;
  // Shared handler for whichever source (die or L3) defines a cluster.
  const auto foreach_cluster = [&](size_t num_groups,
                                   const GROUP_AFFINITY* groups) {
    const size_t lps_per_cluster = NumBits(num_groups, groups);
    // Estimate cores from LPs, assuming uniform SMT width.
    const size_t cores_per_cluster = DivCeil(lps_per_cluster, max_lps_per_core);
    max_cores_per_cluster = HWY_MAX(max_cores_per_cluster, cores_per_cluster);
    ForeachBit(num_groups, groups, lps, __LINE__,
               [cluster_idx](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].cluster = static_cast<uint16_t>(cluster_idx);
               });
    ++cluster_idx;
  };
  const auto foreach_die = [&foreach_cluster](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    foreach_cluster(p.GroupCount, p.GroupMask);
  };
  const auto foreach_l3 = [&foreach_cluster](const SLPI& info) {
    const CACHE_RELATIONSHIP& cr = info.Cache;
    // Only data-carrying L3 caches define clusters.
    if (cr.Type != CacheUnified && cr.Type != CacheData) return;
    if (cr.Level != 3) return;
    foreach_cluster(cr.GroupCount, cr.GroupMasks);
  };
  // Prefer die information; fall back to L3 cache groupings.
  if (!ForEachSLPI(RelationProcessorDie, foreach_die)) {
    (void)ForEachSLPI(RelationCache, foreach_l3);
  }
  if (max_cores_per_cluster == 0) {
    HWY_WARN("All clusters empty, assuming 1 core each\n");
    max_cores_per_cluster = 1;
  }
  return max_cores_per_cluster;
}
// Windows: fills core/SMT/cluster/package fields of `lps` and returns the
// per-package cluster/core counts.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  const size_t max_lps_per_core = MaxLpsPerCore(lps);
  const size_t max_cores_per_cluster =
      MaxCoresPerCluster(max_lps_per_core, lps);
  std::vector<PackageSizes> packages;
  size_t package_idx = 0;
  (void)ForEachSLPI(RelationProcessorPackage, [&](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    const size_t lps_per_package = NumBits(p.GroupCount, p.GroupMask);
    // NOTE(review): num_clusters is set to the max cores per cluster, not a
    // cluster count — presumably an upper bound; confirm intent.
    PackageSizes ps;
    ps.num_clusters = max_cores_per_cluster;
    // Estimate cores from LPs, assuming uniform SMT width.
    ps.num_cores = DivCeil(lps_per_package, max_lps_per_core);
    packages.push_back(ps);
    ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
               [package_idx](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].package = static_cast<uint8_t>(package_idx);
               });
    ++package_idx;
  });
  return packages;
}
// Windows: sets each LP's NUMA node from RelationNumaNode entries; LPs not
// covered by any entry default to node 0.
void SetNodes(std::vector<Topology::LP>& lps) {
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].node = 0;
  }
  // RelationAll is required because RelationNumaNodeEx may be unsupported;
  // we filter for NUMA entries ourselves.
  (void)ForEachSLPI(RelationAll, [&](const SLPI& info) {
    if (info.Relationship != RelationNumaNode) return;
    const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
    // Older Windows may report GroupCount 0, meaning one group.
    const size_t num_groups = HWY_MAX(1, nn.GroupCount);
    const uint8_t node = static_cast<uint8_t>(nn.NodeNumber);
    ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
               [node](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].node = node;
               });
  });
}
#elif HWY_OS_APPLE
// Apple: derives a single-package topology from sysctl counts, assuming LPs
// are numbered contiguously per core and cores per cluster. Only perflevel0
// (presumably the performance cores) is queried — confirm intent.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  int err;
  size_t total_cores = 0;
  if (!Sysctl("hw.perflevel0.physicalcpu", 1, err, &total_cores)) {
    HWY_WARN("Error %d detecting total_cores, assuming one per LP\n", err);
    total_cores = lps.size();
  }
  if (lps.size() % total_cores != 0) {
    HWY_WARN("LPs %zu not a multiple of total_cores %zu\n", lps.size(),
             total_cores);
  }
  const size_t lp_per_core = DivCeil(lps.size(), total_cores);
  size_t cores_per_cluster = 0;
  // cpusperl2 = cores sharing an L2, used as the cluster size.
  if (!Sysctl("hw.perflevel0.cpusperl2", 1, err, &cores_per_cluster)) {
    HWY_WARN("Error %d detecting cores_per_cluster\n", err);
    cores_per_cluster = HWY_MIN(4, total_cores);
  }
  if (total_cores % cores_per_cluster != 0) {
    HWY_WARN("total_cores %zu not a multiple of cores_per_cluster %zu\n",
             total_cores, cores_per_cluster);
  }
  // Assign core/smt/cluster assuming contiguous numbering.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].package = 0;
    lps[lp].core = static_cast<uint16_t>(lp / lp_per_core);
    lps[lp].smt = static_cast<uint8_t>(lp % lp_per_core);
    lps[lp].cluster = static_cast<uint16_t>(lps[lp].core / cores_per_cluster);
  }
  PackageSizes ps;
  ps.num_clusters = DivCeil(total_cores, cores_per_cluster);
  ps.num_cores = total_cores;
  return std::vector<PackageSizes>{ps};
}
// Apple: no NUMA information is queried; every LP is assigned node 0.
void SetNodes(std::vector<Topology::LP>& lps) {
  for (Topology::LP& lp : lps) {
    lp.node = 0;
  }
}
#endif
#if HWY_OS_WIN || HWY_OS_APPLE
// Windows/Apple: copies the globally detected L2 (private) and L3 (shared)
// sizes into every cluster of every package; zero if detection failed.
void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
  const Cache* caches = DataCaches();
  const size_t private_kib = caches ? caches[2].size_kib : 0;
  const size_t shared_kib = caches ? caches[3].size_kib : 0;
  for (Topology::Package& p : packages) {
    for (Topology::Cluster& c : p.clusters) {
      c.private_kib = private_kib;
      c.shared_kib = shared_kib;
    }
  }
}
#endif
}
// Detects the system topology: packages, their clusters/cores, and the LPs
// belonging to each. On detection failure, `packages` remains empty.
HWY_CONTRIB_DLLEXPORT Topology::Topology() {
#if HWY_OS_LINUX || HWY_OS_WIN || HWY_OS_APPLE
  lps.resize(TotalLogicalProcessors());
  // Platform-specific: fills per-LP fields and returns per-package sizes.
  const std::vector<PackageSizes>& package_sizes = DetectPackages(lps);
  if (package_sizes.empty()) return;
  SetNodes(lps);
  // Allocate cluster/core vectors, then record each LP in its sets.
  packages.resize(package_sizes.size());
  for (size_t p = 0; p < packages.size(); ++p) {
    packages[p].clusters.resize(package_sizes[p].num_clusters);
    packages[p].cores.resize(package_sizes[p].num_cores);
  }
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    Package& p = packages[lps[lp].package];
    p.clusters[lps[lp].cluster].lps.Set(lp);
    p.cores[lps[lp].core].lps.Set(lp);
  }
  SetClusterCacheSizes(packages);
#endif
}
namespace {
using Caches = std::array<Cache, 4>;
#if HWY_OS_LINUX
// Reads the sysfs attribute `name` of cpu0's cache `index` and returns its
// contents with trailing newlines/null bytes stripped; empty on failure.
std::string ReadString(const char* name, size_t index) {
  const std::string path =
      std::string("/sys/devices/system/cpu/cpu0/cache/index%zu/") + name;
  char buf200[200];
  size_t len = ReadSysfs(path.c_str(), index, buf200);
  // Trim trailing terminators and newlines.
  while (len != 0 && (buf200[len - 1] == '\0' || buf200[len - 1] == '\n')) {
    --len;
  }
  return std::string(buf200, len);
}
// Despite the name, this READS the sysfs attribute `name` of cache `index`
// and writes the parsed number to `out`. Returns false on failure.
// NOTE(review): the name is misleading; kept for interface stability.
template <typename T>
bool WriteSysfs(const char* name, size_t index, T* out) {
  const std::string str = ReadString(name, index);
  size_t pos = 0;
  size_t val;
  if (!ParseDigits(str.c_str(), str.length(), pos, &val)) return false;
  HWY_ASSERT(pos <= str.length());
  *out = static_cast<T>(val);
  return true;
}
// Linux: fills L1-L3 data-cache parameters from sysfs. Sizes and set counts
// for shared caches are divided by the number of cores sharing them, i.e.
// stored as per-core values. Returns false if L1 or L2 is undetected.
bool InitCachesSysfs(Caches& caches) {
  // Topology is needed to translate shared_cpu_list into a core count.
  std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
  const std::vector<PackageSizes> package_sizes = DetectPackages(lps);
  if (package_sizes.empty()) {
    HWY_WARN("no packages, shared cache sizes may be incorrect\n");
    return false;
  }
  // Iterate over cache "index*" directories until one is missing.
  for (size_t i = 0;; ++i) {
    const std::string type = ReadString("type", i);
    if (type.empty()) break;
    if (type != "Data" && type != "Unified") continue;  // Skip instruction.
    uint32_t level;
    if (!WriteSysfs("level", i, &level)) continue;
    if (level != 1 && level != 2 && level != 3) continue;
    Cache& c = caches[level];
    // Only record the first cache seen per level (cpu0's view).
    if (c.size_kib != 0) {
      HWY_WARN("ignoring another L%u, first size %u\n", level, c.size_kib);
      continue;
    }
    const bool ok = WriteSysfs("size", i, &c.size_kib) &&
                    WriteSysfs("ways_of_associativity", i, &c.associativity) &&
                    WriteSysfs("number_of_sets", i, &c.sets);
    if (HWY_UNLIKELY(!ok)) {
      HWY_WARN("skipping partially-detected L%u, error %d\n", level, errno);
      c = Cache();  // Reset so later checks see "undetected".
      continue;
    }
    // Derive line size; DivByFactor aborts if not an exact factor.
    const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
    const size_t lines = c.associativity * c.sets;
    c.bytes_per_line = static_cast<uint16_t>(DivByFactor(bytes, lines));
    const std::string shared_str = ReadString("shared_cpu_list", i);
    if (HWY_UNLIKELY(shared_str.empty())) {
      HWY_WARN("no shared_cpu_list for L%u %s\n", level, type.c_str());
      c.cores_sharing = 1;
    } else {
      const std::vector<size_t> shared_lps =
          ExpandList(shared_str.c_str(), shared_str.length(), lps.size() - 1);
      // Count unique cores: only the first SMT sibling of each core.
      size_t num_cores = 0;
      for (size_t lp : shared_lps) {
        if (HWY_LIKELY(lp < lps.size())) {
          num_cores += lps[lp].smt == 0;
        } else {
          HWY_WARN("out of bounds lp %zu of %zu from %s\n", lp, lps.size(),
                   shared_str.c_str());
        }
      }
      if (num_cores == 0) {
        HWY_WARN("no cores sharing L%u %s, setting to 1\n", level,
                 type.c_str());
        num_cores = 1;
      }
      // Convert total size/sets into per-core values.
      c.cores_sharing = static_cast<uint16_t>(num_cores);
      c.size_kib = static_cast<uint32_t>(c.size_kib / num_cores);
      c.sets = static_cast<uint32_t>(c.sets / num_cores);
    }
  }
  if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
#ifndef __ANDROID__
    // Android often lacks these sysfs entries; stay quiet there.
    HWY_WARN("sysfs detected L1=%u L2=%u, err %x\n", caches[1].size_kib,
             caches[2].size_kib, errno);
#endif
    return false;
  }
  return true;
}
#elif HWY_OS_WIN
// Windows: fills L1-L3 data-cache parameters from RelationCache entries.
// Shared sizes are divided by the estimated number of sharing cores, i.e.
// stored as per-core values. Returns false if L1 or L2 is undetected.
bool InitCachesWin(Caches& caches) {
  // Topology is needed to convert sharing-LP counts into core counts.
  std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
  const size_t max_lps_per_core = MaxLpsPerCore(lps);
  (void)ForEachSLPI(RelationCache, [max_lps_per_core,
                                    &caches](const SLPI& info) {
    const CACHE_RELATIONSHIP& cr = info.Cache;
    if (cr.Type != CacheUnified && cr.Type != CacheData) return;
    if (1 <= cr.Level && cr.Level <= 3) {
      Cache& c = caches[cr.Level];
      // Only record the first cache seen per level.
      if (c.size_kib > 0) return;
      c.size_kib = static_cast<uint32_t>(DivByFactor(cr.CacheSize, 1024));
      c.bytes_per_line = static_cast<uint16_t>(cr.LineSize);
      // Windows reports fully-associative as a sentinel; map to our max.
      c.associativity = (cr.Associativity == CACHE_FULLY_ASSOCIATIVE)
                            ? Cache::kMaxAssociativity
                            : cr.Associativity;
      // Estimate sharing cores from sharing LPs, assuming uniform SMT.
      size_t shared_with = NumBits(cr.GroupCount, cr.GroupMasks);
      shared_with = DivCeil(shared_with, max_lps_per_core);
      if (shared_with == 0) {
        HWY_WARN("no cores sharing L%u, setting to 1\n", cr.Level);
        shared_with = 1;
      }
      // Store per-core size.
      c.size_kib = static_cast<uint32_t>(c.size_kib / shared_with);
      c.cores_sharing = static_cast<uint16_t>(shared_with);
    }
  });
  if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
    HWY_WARN("Windows detected L1=%u, L2=%u, err %lx\n", caches[1].size_kib,
             caches[2].size_kib, GetLastError());
    return false;
  }
  return true;
}
#elif HWY_OS_APPLE
// Apple: fills L1-L3 parameters from sysctl, with per-model fallbacks for
// line size and associativity (not all values are exposed via sysctl).
// Returns false only if L1/L2 sizes are undetectable.
bool InitCachesApple(Caches& caches) {
  int err = 0;
  Cache& L1 = caches[1];
  Cache& L2 = caches[2];
  Cache& L3 = caches[3];
  // Prefer perflevel0 (per-core-type) values; fall back to legacy keys.
  bool ok = Sysctl("hw.perflevel0.l1dcachesize", 1024, err, &L1.size_kib) ||
            Sysctl("hw.l1dcachesize", 1024, err, &L1.size_kib);
  ok &= Sysctl("hw.perflevel0.l2cachesize", 1024, err, &L2.size_kib) ||
        Sysctl("hw.l2cachesize", 1024, err, &L2.size_kib);
  if (HWY_UNLIKELY(!ok)) {
    HWY_WARN("Apple cache detection failed, error %d\n", err);
    return false;
  }
  L1.cores_sharing = 1;  // L1 is per-core.
  // L2 is shared by a cluster; store per-core size.
  if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) {
    L2.size_kib /= L2.cores_sharing;
  } else {
    L2.cores_sharing = 1;
  }
  // Known line sizes/associativity per Apple Silicon generation, keyed off
  // the brand string ("Apple M1" etc.).
  char brand[128] = {0};
  size_t size = sizeof(brand);
  if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) {
    if (strncmp(brand, "Apple ", 6) != 0) {
      HWY_WARN("unexpected Apple brand %s\n", brand);
    }
    if (brand[6] == 'M') {
      L1.bytes_per_line = 64;
      L1.associativity = 8;
      L2.bytes_per_line = 128;
      if (brand[7] == '1') {
        L2.associativity = 12;
      } else if ('2' <= brand[7] && brand[7] <= '4') {
        L2.associativity = 16;
      } else {
        L2.associativity = 0;  // Unknown model: filled via sysctl below.
      }
    }
  }
  // Generic fallback line size for any level not set above.
  uint16_t bytes_per_line;
  if (!Sysctl("hw.cachelinesize", 1, err, &bytes_per_line)) {
    bytes_per_line = static_cast<uint16_t>(HWY_ALIGNMENT);
  }
  for (size_t level = 1; level <= 3; ++level) {
    if (caches[level].bytes_per_line == 0) {
      caches[level].bytes_per_line = bytes_per_line;
    }
  }
  // Associativity fallbacks: sysctl first, then hard-coded defaults.
  if (L1.associativity == 0 && !Sysctl("machdep.cpu.cache.L1_associativity", 1,
                                       err, &L1.associativity)) {
    L1.associativity = 8;
  }
  if (L2.associativity == 0 && !Sysctl("machdep.cpu.cache.L2_associativity", 1,
                                       err, &L2.associativity)) {
    L2.associativity = 12;
  }
  if (L3.associativity == 0) {
    L3.associativity = 12;
  }
  // L3 may be absent (e.g. Apple Silicon); per-core size if present.
  if (L3.size_kib == 0 &&
      (Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) ||
       Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) {
    if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) {
      L3.size_kib /= L3.cores_sharing;
    } else {
      L3.cores_sharing = 1;
    }
  }
  if (L3.size_kib == 0) {
    L3 = Cache();  // Mark L3 as absent.
  }
  return true;
}
#endif
// Computes `c.sets` from size/line/associativity if not yet known, or
// verifies it is consistent (within 1, to allow integer truncation) with the
// OS-detected value; aborts on larger mismatches.
HWY_MAYBE_UNUSED void ComputeSets(Cache& c) {
  if (HWY_UNLIKELY(c.size_kib == 0)) {
    c.sets = 0;  // Cache absent.
    return;
  }
  const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
  const size_t lines = bytes / c.bytes_per_line;
  const size_t sets = lines / c.associativity;
  if (c.sets == 0) {
    c.sets = static_cast<uint32_t>(sets);
  } else {
    // Fixed: compute a symmetric difference. `c.sets - sets` underflowed
    // (unsigned) whenever the computed value exceeded the detected one,
    // turning an off-by-one into a spurious abort.
    const size_t diff = c.sets > sets ? c.sets - sets : sets - c.sets;
    if (diff > 1) {
      HWY_ABORT("Inconsistent cache sets %u != %zu\n", c.sets, sets);
    }
  }
}
// Detects L1-L3 data-cache parameters once and returns a pointer to a static
// array indexed by level (entry 0 unused), or nullptr on failure.
const Cache* InitDataCaches() {
  // Cache-line aligned; zero-initialized static storage.
  alignas(64) static Caches caches;
#if HWY_OS_LINUX
  if (HWY_UNLIKELY(!InitCachesSysfs(caches))) return nullptr;
#elif HWY_OS_WIN
  if (HWY_UNLIKELY(!InitCachesWin(caches))) return nullptr;
#elif HWY_OS_APPLE
  if (HWY_UNLIKELY(!InitCachesApple(caches))) return nullptr;
#else
  HWY_WARN("Cache detection not implemented for this platform.\n");
  (void)caches;
  return nullptr;
// Marker macro so the post-processing below is compiled out.
#define HWY_NO_CACHE_DETECTION
#endif
#ifndef HWY_NO_CACHE_DETECTION
  // Fill in or verify the number of sets per level.
  for (size_t level = 1; level <= 3; ++level) {
    ComputeSets(caches[level]);
  }
  // Presumably discards L3 when the per-core share is tiny (widely shared
  // small L3 is not useful) — confirm intent.
  if (caches[3].cores_sharing >= 16 && caches[3].size_kib <= 512) {
    caches[3] = Cache();
  }
  return &caches[0];
#endif
}
}
// Thread-safe lazy accessor for the detected data-cache parameters. Returns
// nullptr if detection failed; detection runs at most once.
HWY_CONTRIB_DLLEXPORT const Cache* DataCaches() {
  static const Cache* const cached_result = InitDataCaches();
  return cached_result;
}
}