// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/thread_pool/topology.h"

#include <ctype.h>  // isspace
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>  // malloc/free
#include <string.h>  // strchr

#include <array>
#include <map>
#include <string>
#include <vector>

#include "hwy/base.h"  // HWY_OS_WIN, HWY_WARN

#if HWY_OS_APPLE
#include <sys/sysctl.h>

#include "hwy/aligned_allocator.h"  // HWY_ALIGNMENT
#endif

#if HWY_OS_WIN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601  // Windows 7 / Server 2008 R2
#endif
#include <windows.h>
#endif  // HWY_OS_WIN

#if HWY_OS_LINUX || HWY_OS_FREEBSD
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>  // sysconf
#endif  // HWY_OS_LINUX || HWY_OS_FREEBSD

#if HWY_OS_FREEBSD
#include <sys/param.h>
// After param.h / types.h.
#include <sys/cpuset.h>
#endif  // HWY_OS_FREEBSD

#if HWY_ARCH_WASM
#include <emscripten/threading.h>
#endif

namespace hwy {

HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() {
#if HWY_ARCH_WASM
  return emscripten_has_threading_support() != 0;
#else
  return true;
#endif
}

namespace {

// Returns `whole / part`, with a check that `part` evenly divides `whole`,
// which implies the result is exact.
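// For example (illustrative): DivByFactor(24, 6) returns 4, whereas
// DivByFactor(24, 5) would abort because 5 does not evenly divide 24.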
HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) {
  HWY_ASSERT(part != 0);
  const size_t div = whole / part;
  const size_t mul = div * part;
  if (mul != whole) {
    HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, div, part, mul);
  }
  return div;
}

#if HWY_OS_WIN

using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;

template <class Func>
bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) {
  // Get required buffer size.
  DWORD buf_bytes = 0;
  HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes));
  // Observed when `rel` is not supported:
  if (HWY_UNLIKELY(buf_bytes == 0 && GetLastError() == ERROR_GEN_FAILURE)) {
    if (rel != RelationNumaNodeEx && rel != RelationProcessorDie) {
      HWY_WARN("Unexpected err %lx for GLPI relationship %d\n", GetLastError(),
               static_cast<int>(rel));
    }
    return false;
  }
  HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER);
  // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding.
  uint8_t* buf = static_cast<uint8_t*>(malloc(buf_bytes));
  HWY_ASSERT(buf);

  // Fill the buffer.
  SLPI* info = reinterpret_cast<SLPI*>(buf);
  if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) {
    free(buf);
    return false;
  }

  // Iterate over each SLPI. `sizeof(SLPI)` is unreliable, see above.
  uint8_t* pos = buf;
  while (pos < buf + buf_bytes) {
    info = reinterpret_cast<SLPI*>(pos);
    HWY_ASSERT(rel == RelationAll || info->Relationship == rel);
    func(*info);
    pos += info->Size;
  }
  if (pos != buf + buf_bytes) {
    HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n",
             pos, buf + buf_bytes, buf_bytes, sizeof(SLPI));
  }

  free(buf);
  return true;
}

size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) {
  size_t total_bits = 0;
  for (size_t i = 0; i < num_groups; ++i) {
    size_t bits = 0;
    hwy::CopyBytes<sizeof(bits)>(&affinity[i].Mask, &bits);
    total_bits += hwy::PopCount(bits);
  }
  return total_bits;
}

// Calls `func(lp, lps)` for each index `lp` in the set, after ensuring that
// `lp < lps.size()`. `line` is for debugging via Warn().
template <class Func>
void ForeachBit(size_t num_groups, const GROUP_AFFINITY* affinity,
                std::vector<Topology::LP>& lps, int line, const Func& func) {
  for (size_t group = 0; group < num_groups; ++group) {
    size_t bits = 0;
    hwy::CopyBytes<sizeof(bits)>(&affinity[group].Mask, &bits);
    while (bits != 0) {
      size_t lp = Num0BitsBelowLS1Bit_Nonzero64(bits);
      bits &= bits - 1;  // clear LSB
      if (HWY_UNLIKELY(lp >= lps.size())) {
        Warn(__FILE__, line, "Clamping lp %zu to lps.size() %zu, groups %zu\n",
             lp, lps.size(), num_groups);
        lp = lps.size() - 1;
      }
      func(lp, lps);
    }
  }
}

#elif HWY_OS_APPLE

// Returns whether sysctlbyname() succeeded; if so, writes `val / div` to
// `out`, otherwise sets `err`.
template <typename T>
bool Sysctl(const char* name, size_t div, int& err, T* out) {
  size_t val = 0;
  size_t size = sizeof(val);
  // Last two arguments are for updating the value, which we do not want.
  const int ret = sysctlbyname(name, &val, &size, nullptr, 0);
  if (HWY_UNLIKELY(ret != 0)) {
    // Do not print warnings because some `name` are expected to fail.
    err = ret;
    return false;
  }
  *out = static_cast<T>(DivByFactor(val, div));
  return true;
}
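
// Usage sketch (illustrative): passing `div` = 1024 converts a byte count
// reported by sysctl into KiB, as done by the cache detection below:
//   int err = 0;
//   uint32_t l1_kib = 0;
//   if (Sysctl("hw.perflevel0.l1dcachesize", 1024, err, &l1_kib)) { ... }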

#endif  // HWY_OS_*

}  // namespace

HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() {
  size_t total_lps = 0;
#if HWY_ARCH_WASM
  const int num_cores = emscripten_num_logical_cores();
  if (num_cores > 0) total_lps = static_cast<size_t>(num_cores);
#elif HWY_OS_WIN
  // If there are multiple groups, this should return them all, rather than
  // just the first 64, but some VMs report fewer.
  (void)ForEachSLPI(RelationProcessorCore, [&total_lps](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    total_lps += NumBits(p.GroupCount, p.GroupMask);
  });
#elif HWY_OS_LINUX
  // Only check "online" because sysfs entries such as topology are missing for
  // offline CPUs, which will cause `DetectPackages` to fail.
  const long ret = sysconf(_SC_NPROCESSORS_ONLN);  // NOLINT(runtime/int)
  if (ret < 0) {
    HWY_WARN("Unexpected _SC_NPROCESSORS_CONF = %d\n", static_cast<int>(ret));
  } else {
    total_lps = static_cast<size_t>(ret);
  }
#elif HWY_OS_APPLE
  int err;
  // Only report P (performance) cores.
  if (!Sysctl("hw.perflevel0.logicalcpu", 1, err, &total_lps)) {
    total_lps = 0;
  }
#endif

  if (HWY_UNLIKELY(total_lps == 0)) {  // Failed to detect.
    HWY_WARN(
        "Unknown TotalLogicalProcessors, assuming 1. "
        "HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n"
        "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n",
        HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86,
        HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X);
    return 1;
  }

  // Warn that we are clamping.
  if (HWY_UNLIKELY(total_lps > kMaxLogicalProcessors)) {
    HWY_WARN("OS reports %zu processors but clamping to %zu\n", total_lps,
             kMaxLogicalProcessors);
    total_lps = kMaxLogicalProcessors;
  }

  return total_lps;
}
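
// Usage sketch (illustrative; `PerThreadState` is any caller-defined type):
// the result is at least 1 and clamped to `kMaxLogicalProcessors`, so it is
// safe for sizing per-LP arrays:
//   std::vector<PerThreadState> state(hwy::TotalLogicalProcessors());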

// ------------------------------ Affinity

#if HWY_OS_LINUX || HWY_OS_FREEBSD
namespace {

#if HWY_OS_LINUX
using CpuSet = cpu_set_t;
#else
using CpuSet = cpuset_t;
#endif

// Helper functions reduce the number of #if in GetThreadAffinity.
int GetAffinity(CpuSet* set) {
  // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD.
#if defined(__ANDROID__) && __ANDROID_API__ < 12
  return syscall(__NR_sched_getaffinity, 0, sizeof(CpuSet), set);
#elif HWY_OS_FREEBSD
  return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet),
                            set);
#else  // normal Linux
  return sched_getaffinity(0, sizeof(CpuSet), set);
#endif
}

int SetAffinity(CpuSet* set) {
  // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD.
#if defined(__ANDROID__) && __ANDROID_API__ < 12
  return syscall(__NR_sched_setaffinity, 0, sizeof(CpuSet), set);
#elif HWY_OS_FREEBSD
  return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet),
                            set);
#else  // normal Linux
  return sched_setaffinity(0, sizeof(CpuSet), set);
#endif
}

bool IsSet(size_t lp, const CpuSet* set) {
#if HWY_COMPILER_GCC_ACTUAL
  // Workaround for GCC compiler warning with CPU_ISSET macro
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
#endif
  const int is_set = CPU_ISSET(static_cast<int>(lp), set);
#if HWY_COMPILER_GCC_ACTUAL
  HWY_DIAGNOSTICS(pop)
#endif
  return is_set != 0;
}

void Set(size_t lp, CpuSet* set) {
#if HWY_COMPILER_GCC_ACTUAL
  // Workaround for GCC compiler warning with CPU_SET macro
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
#endif
  CPU_SET(static_cast<int>(lp), set);
#if HWY_COMPILER_GCC_ACTUAL
  HWY_DIAGNOSTICS(pop)
#endif
}

}  // namespace
#endif  // HWY_OS_LINUX || HWY_OS_FREEBSD

HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) {
#if HWY_OS_WIN
  // Only support the first 64 because WINE does not support processor groups.
  const HANDLE hThread = GetCurrentThread();
  const DWORD_PTR prev = SetThreadAffinityMask(hThread, ~DWORD_PTR(0));
  if (!prev) return false;
  (void)SetThreadAffinityMask(hThread, prev);
  lps = LogicalProcessorSet();  // clear all
  lps.SetNonzeroBitsFrom64(prev);
  return true;
#elif HWY_OS_LINUX || HWY_OS_FREEBSD
  CpuSet set;
  CPU_ZERO(&set);
  const int err = GetAffinity(&set);
  if (err != 0) return false;
  for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) {
    if (IsSet(lp, &set)) lps.Set(lp);
  }
  return true;
#else
  // For HWY_OS_APPLE, affinity is not supported. Do not even set lp=0 to force
  // callers to handle this case.
  (void)lps;
  return false;
#endif
}

HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) {
#if HWY_OS_WIN
  const HANDLE hThread = GetCurrentThread();
  const DWORD_PTR prev = SetThreadAffinityMask(hThread, lps.Get64());
  return prev != 0;
#elif HWY_OS_LINUX || HWY_OS_FREEBSD
  CpuSet set;
  CPU_ZERO(&set);
  lps.Foreach([&set](size_t lp) { Set(lp, &set); });
  const int err = SetAffinity(&set);
  if (err != 0) return false;
  return true;
#else
  // Apple THREAD_AFFINITY_POLICY is only an (often ignored) hint.
  (void)lps;
  return false;
#endif
}
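
// Usage sketch (illustrative): pin the current thread to one LP and restore
// the previous affinity afterwards. Both calls return false on Apple, so
// callers must handle failure:
//   hwy::LogicalProcessorSet prev;
//   if (hwy::GetThreadAffinity(prev)) {
//     hwy::LogicalProcessorSet only;
//     only.Set(0);  // first logical processor
//     if (hwy::SetThreadAffinity(only)) {
//       // ... timing-sensitive work ...
//       (void)hwy::SetThreadAffinity(prev);  // restore
//     }
//   }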

namespace {

struct PackageSizes {
  size_t num_clusters;
  size_t num_cores;
};

#if HWY_OS_LINUX

class File {
 public:
  explicit File(const char* path) {
    for (;;) {
      fd_ = open(path, O_RDONLY);
      if (fd_ > 0) return;           // success
      if (errno == EINTR) continue;  // signal: retry
      if (errno == ENOENT) return;   // not found, give up
      HWY_WARN("Unexpected error opening %s: %d\n", path, errno);
      return;  // unknown error, give up
    }
  }

  ~File() {
    if (fd_ > 0) {
      for (;;) {
        const int ret = close(fd_);
        if (ret == 0) break;           // success
        if (errno == EINTR) continue;  // signal: retry
        HWY_WARN("Unexpected error closing file: %d\n", errno);
        return;  // unknown error, ignore
      }
    }
  }

  // Returns number of bytes read (including the '\0' appended at EOF), or 0
  // on failure.
  size_t Read(char* buf200) const {
    if (fd_ <= 0) return 0;
    size_t pos = 0;
    for (;;) {
      // `read` instead of `pread`, which might not work for sysfs. Leave
      // room for the '\0' terminator appended at EOF.
      const auto bytes_read = read(fd_, buf200 + pos, 200 - 1 - pos);
      if (bytes_read == 0) {  // EOF (or buffer full): done
        buf200[pos++] = '\0';
        return pos;
      }
      if (bytes_read == -1) {
        if (errno == EINTR) continue;  // signal: retry
        HWY_WARN("Unexpected error reading file: %d\n", errno);
        return 0;
      }
      pos += static_cast<size_t>(bytes_read);
      HWY_ASSERT(pos < 200);
    }
  }

 private:
  int fd_;
};

// Returns bytes read, or 0 on failure.
size_t ReadSysfs(const char* format, size_t lp, char* buf200) {
  char path[200];
  const int bytes_written = snprintf(path, sizeof(path), format, lp);
  HWY_ASSERT(0 < bytes_written &&
             bytes_written < static_cast<int>(sizeof(path) - 1));

  const File file(path);
  return file.Read(buf200);
}

// Interprets [str + pos, str + end) as base-10 ASCII. Stops when any non-digit
// is found, or at end. Returns false if no digits were found.
bool ParseDigits(const char* str, const size_t end, size_t& pos, size_t* out) {
  HWY_ASSERT(pos <= end);
  const size_t begin = pos;  // remember where parsing started
  // 9 digits cannot overflow even 32-bit size_t.
  const size_t stop = begin + 9;
  *out = 0;
  for (; pos < HWY_MIN(end, stop); ++pos) {
    const int c = str[pos];
    if (c < '0' || c > '9') break;
    *out *= 10;
    *out += static_cast<size_t>(c - '0');
  }
  // Compare against `begin`, not 0: callers may start parsing mid-string.
  if (pos == begin) {  // No digits found
    *out = 0;
    return false;
  }
  return true;
}

// Number, plus optional K or M suffix, plus terminator.
bool ParseNumberWithOptionalSuffix(const char* str, size_t len, size_t* out) {
  size_t pos = 0;
  if (!ParseDigits(str, len, pos, out)) return false;
  if (str[pos] == 'K') {
    *out <<= 10;
    ++pos;
  }
  if (str[pos] == 'M') {
    *out <<= 20;
    ++pos;
  }
  if (str[pos] != '\0' && str[pos] != '\n') {
    HWY_ABORT("Expected [suffix] terminator at %zu %s\n", pos, str);
  }
  return true;
}
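
// For example (illustrative): "64\n" parses to 64, "32K" to 32 << 10 = 32768,
// and "1M" to 1 << 20 = 1048576.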

bool ReadNumberWithOptionalSuffix(const char* format, size_t lp, size_t* out) {
  char buf200[200];
  const size_t pos = ReadSysfs(format, lp, buf200);
  if (pos == 0) return false;
  return ParseNumberWithOptionalSuffix(buf200, pos, out);
}

const char* kPackage =
    "/sys/devices/system/cpu/cpu%zu/topology/physical_package_id";
const char* kCluster = "/sys/devices/system/cpu/cpu%zu/cache/index3/id";
const char* kCore = "/sys/devices/system/cpu/cpu%zu/topology/core_id";
const char* kL2Size = "/sys/devices/system/cpu/cpu%zu/cache/index2/size";
const char* kL3Size = "/sys/devices/system/cpu/cpu%zu/cache/index3/size";
const char* kNode = "/sys/devices/system/node/node%zu/cpulist";

// sysfs values can be arbitrarily large, so store in a map and replace with
// indices in order of appearance.
class Remapper {
 public:
  // Returns false on error, or sets `out_index` to the index of the sysfs
  // value selected by `format` and `lp`.
  template <typename T>
  bool operator()(const char* format, size_t lp, T* HWY_RESTRICT out_index) {
    size_t opaque;
    if (!ReadNumberWithOptionalSuffix(format, lp, &opaque)) return false;

    const auto ib = indices_.insert({opaque, num_});
    num_ += ib.second;                      // increment if inserted
    const size_t index = ib.first->second;  // new or existing
    HWY_ASSERT(index < num_);
    HWY_ASSERT(index < hwy::LimitsMax<T>());
    *out_index = static_cast<T>(index);
    return true;
  }

  size_t Num() const { return num_; }

 private:
  std::map<size_t, size_t> indices_;
  size_t num_ = 0;
};
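
// For example (illustrative): if three LPs report physical_package_id values
// 36, 36 and 0, `Remapper` assigns indices 0, 0 and 1 (in order of first
// appearance) and `Num()` returns 2.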

// For internal use by `DetectPackages`.
struct PerPackage {
  Remapper clusters;
  Remapper cores;
  // We rely on this zero-init and increment it below.
  uint8_t smt_per_core[kMaxLogicalProcessors] = {0};
};

// Initializes `lps` and returns a PackageSizes vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  std::vector<PackageSizes> empty;

  Remapper packages;
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    if (!packages(kPackage, lp, &lps[lp].package)) {
      HWY_WARN("Failed to read sysfs package for LP %zu\n", lp);
      return empty;
    }
  }
  std::vector<PerPackage> per_package(packages.Num());
  HWY_ASSERT(!per_package.empty());

  for (size_t lp = 0; lp < lps.size(); ++lp) {
    PerPackage& pp = per_package[lps[lp].package];
    // Not a failure: some CPUs lack a (shared) L3 cache.
    if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) {
      lps[lp].cluster = 0;
    }

    if (!pp.cores(kCore, lp, &lps[lp].core)) {
      HWY_WARN("Failed to read sysfs core for LP %zu\n", lp);
      return empty;
    }

    // SMT ID is how many LPs we have already seen assigned to the same core.
    HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors);
    lps[lp].smt = pp.smt_per_core[lps[lp].core]++;
    HWY_ASSERT(lps[lp].smt < 16);
  }

  std::vector<PackageSizes> package_sizes(per_package.size());
  for (size_t p = 0; p < package_sizes.size(); ++p) {
    // Was zero if the package has no shared L3, see above.
    package_sizes[p].num_clusters = HWY_MAX(1, per_package[p].clusters.Num());
    package_sizes[p].num_cores = per_package[p].cores.Num();
    HWY_ASSERT(package_sizes[p].num_cores != 0);
  }
  return package_sizes;
}

std::vector<size_t> ExpandList(const char* list, size_t list_end,
                               size_t max_lp) {
  std::vector<size_t> expanded;
  constexpr size_t kNotFound = ~size_t{0};
  size_t pos = 0;

  // Gracefully handle empty lists, which happens on GH200 systems (#2668).
  // Cast to unsigned char because `isspace` has UB for negative char values.
  if (isspace(static_cast<unsigned char>(list[0])) && list_end <= 2) {
    return expanded;
  }

  // Returns first `found_pos >= pos` where `list[found_pos] == c`, or
  // `kNotFound`.
  const auto find = [list, list_end, &pos](char c) -> size_t {
    const char* found_ptr = strchr(list + pos, c);
    if (found_ptr == nullptr) return kNotFound;
    const size_t found_pos = static_cast<size_t>(found_ptr - list);
    HWY_ASSERT(found_pos < list_end && list[found_pos] == c);
    return found_pos;
  };

  // Reads LP number and advances `pos`. `end` is for verifying we did not
  // read past a known terminator, or the end of string.
  const auto parse_lp = [list, list_end, &pos, max_lp](size_t end) -> size_t {
    end = HWY_MIN(end, list_end);
    size_t lp;
    HWY_ASSERT(ParseDigits(list, end, pos, &lp));
    HWY_IF_CONSTEXPR(HWY_ARCH_RISCV) {
      // On RISC-V, both TotalLogicalProcessors and GetThreadAffinity may
      // under-report the count, hence clamp.
      lp = HWY_MIN(lp, max_lp);
    }
    HWY_ASSERT(lp <= max_lp);
    HWY_ASSERT(pos <= end);
    return lp;
  };

  // Parse all [first-]last separated by commas.
  for (;;) {
    // Single number or first of range: ends with dash, comma, or end.
    const size_t lp_range_first = parse_lp(HWY_MIN(find('-'), find(',')));

    if (list[pos] == '-') {  // range
      ++pos;                 // skip dash
      // Last of range ends with comma or end.
      const size_t lp_range_last = parse_lp(find(','));

      expanded.reserve(expanded.size() + lp_range_last - lp_range_first + 1);
      for (size_t lp = lp_range_first; lp <= lp_range_last; ++lp) {
        expanded.push_back(lp);
      }
    } else {  // single number
      expanded.push_back(lp_range_first);
    }

    // Done if reached end of string.
    if (pos == list_end || list[pos] == '\0' || list[pos] == '\n') {
      break;
    }
    // Comma means at least one more term is coming.
    if (list[pos] == ',') {
      ++pos;
      continue;
    }
    HWY_ABORT("Unexpected character at %zu in %s\n", pos, list);
  }  // for pos

  return expanded;
}
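
// For example (illustrative): ExpandList("0-2,4\n", 6, /*max_lp=*/7) returns
// {0, 1, 2, 4}.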

// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
  // For each NUMA node found via sysfs:
  for (size_t node = 0;; node++) {
    // Read its cpulist so we can scatter `node` to all its `lps`.
    char buf200[200];
    const size_t bytes_read = ReadSysfs(kNode, node, buf200);
    if (bytes_read == 0) break;
    const std::vector<size_t> list =
        ExpandList(buf200, bytes_read, lps.size() - 1);
    for (size_t lp : list) {
      lps[lp].node = static_cast<uint8_t>(node);
    }
  }
}

void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
  for (size_t ip = 0; ip < packages.size(); ++ip) {
    Topology::Package& p = packages[ip];
    for (size_t ic = 0; ic < p.clusters.size(); ++ic) {
      Topology::Cluster& c = p.clusters[ic];
      const size_t lp = c.lps.First();
      size_t bytes;
      if (ReadNumberWithOptionalSuffix(kL2Size, lp, &bytes)) {
        c.private_kib = bytes >> 10;
      }
      if (ReadNumberWithOptionalSuffix(kL3Size, lp, &bytes)) {
        c.shared_kib = bytes >> 10;
      }
    }
  }
}

#elif HWY_OS_WIN

// Also sets LP.core and LP.smt.
size_t MaxLpsPerCore(std::vector<Topology::LP>& lps) {
  size_t max_lps_per_core = 0;
  size_t core_idx = 0;
  (void)ForEachSLPI(RelationProcessorCore, [&max_lps_per_core, &core_idx,
                                            &lps](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    const size_t lps_per_core = NumBits(p.GroupCount, p.GroupMask);
    max_lps_per_core = HWY_MAX(max_lps_per_core, lps_per_core);

    size_t smt = 0;
    ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
               [core_idx, &smt](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].core = static_cast<uint16_t>(core_idx);
                 lps[lp].smt = static_cast<uint8_t>(smt++);
               });
    ++core_idx;
  });
  HWY_ASSERT(max_lps_per_core != 0);
  return max_lps_per_core;
}

// Interprets a "processor die", or failing that a shared L3 cache, as a
// cluster. Also sets LP.cluster.
size_t MaxCoresPerCluster(const size_t max_lps_per_core,
                          std::vector<Topology::LP>& lps) {
  size_t max_cores_per_cluster = 0;
  size_t cluster_idx = 0;
  // Shared between `foreach_die` and `foreach_l3`.
  const auto foreach_cluster = [&](size_t num_groups,
                                   const GROUP_AFFINITY* groups) {
    const size_t lps_per_cluster = NumBits(num_groups, groups);
    // `max_lps_per_core` is an upper bound, hence round up. It is not an error
    // if there is only one core per cluster - can happen for L3.
    const size_t cores_per_cluster = DivCeil(lps_per_cluster, max_lps_per_core);
    max_cores_per_cluster = HWY_MAX(max_cores_per_cluster, cores_per_cluster);

    ForeachBit(num_groups, groups, lps, __LINE__,
               [cluster_idx](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].cluster = static_cast<uint16_t>(cluster_idx);
               });
    ++cluster_idx;
  };

  // Passes group bits to `foreach_cluster`, depending on relationship type.
  const auto foreach_die = [&foreach_cluster](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    foreach_cluster(p.GroupCount, p.GroupMask);
  };
  const auto foreach_l3 = [&foreach_cluster](const SLPI& info) {
    const CACHE_RELATIONSHIP& cr = info.Cache;
    if (cr.Type != CacheUnified && cr.Type != CacheData) return;
    if (cr.Level != 3) return;
    foreach_cluster(cr.GroupCount, cr.GroupMasks);
  };

  if (!ForEachSLPI(RelationProcessorDie, foreach_die)) {
    // Has been observed to fail; also check for shared L3 caches.
    (void)ForEachSLPI(RelationCache, foreach_l3);
  }
  if (max_cores_per_cluster == 0) {
    HWY_WARN("All clusters empty, assuming 1 core each\n");
    max_cores_per_cluster = 1;
  }
  return max_cores_per_cluster;
}

// Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  const size_t max_lps_per_core = MaxLpsPerCore(lps);
  const size_t max_cores_per_cluster =
      MaxCoresPerCluster(max_lps_per_core, lps);

  std::vector<PackageSizes> packages;
  size_t package_idx = 0;
  (void)ForEachSLPI(RelationProcessorPackage, [&](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    const size_t lps_per_package = NumBits(p.GroupCount, p.GroupMask);
    PackageSizes ps;  // avoid designated initializers for MSVC
    ps.num_clusters = max_cores_per_cluster;
    // `max_lps_per_core` is an upper bound, hence round up.
    ps.num_cores = DivCeil(lps_per_package, max_lps_per_core);
    packages.push_back(ps);

    ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
               [package_idx](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].package = static_cast<uint8_t>(package_idx);
               });
    ++package_idx;
  });

  return packages;
}

// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
  // Zero-initialize all nodes in case the below fails.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].node = 0;
  }

  // We want the full NUMA nodes, but Windows Server 2022 truncates the results
  // of `RelationNumaNode` to a single 64-LP group. To get the old, unlimited
  // behavior without using the new `RelationNumaNodeEx` symbol, use the old
  // `RelationAll` and filter the SLPI we want.
  (void)ForEachSLPI(RelationAll, [&](const SLPI& info) {
    if (info.Relationship != RelationNumaNode) return;
    const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
    // This field was previously reserved/zero. There is at least one group.
    const size_t num_groups = HWY_MAX(1, nn.GroupCount);
    const uint8_t node = static_cast<uint8_t>(nn.NodeNumber);
    ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
               [node](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].node = node;
               });
  });
}

#elif HWY_OS_APPLE

// Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  int err;

  size_t total_cores = 0;
  if (!Sysctl("hw.perflevel0.physicalcpu", 1, err, &total_cores)) {
    HWY_WARN("Error %d detecting total_cores, assuming one per LP\n", err);
    total_cores = lps.size();
  }

  if (lps.size() % total_cores != 0) {
    HWY_WARN("LPs %zu not a multiple of total_cores %zu\n", lps.size(),
             total_cores);
  }
  const size_t lp_per_core = DivCeil(lps.size(), total_cores);

  size_t cores_per_cluster = 0;
  if (!Sysctl("hw.perflevel0.cpusperl2", 1, err, &cores_per_cluster)) {
    HWY_WARN("Error %d detecting cores_per_cluster\n", err);
    cores_per_cluster = HWY_MIN(4, total_cores);
  }

  if (total_cores % cores_per_cluster != 0) {
    HWY_WARN("total_cores %zu not a multiple of cores_per_cluster %zu\n",
             total_cores, cores_per_cluster);
  }

  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].package = 0;  // single package
    lps[lp].core = static_cast<uint16_t>(lp / lp_per_core);
    lps[lp].smt = static_cast<uint8_t>(lp % lp_per_core);
    lps[lp].cluster = static_cast<uint16_t>(lps[lp].core / cores_per_cluster);
  }

  PackageSizes ps;
  ps.num_clusters = DivCeil(total_cores, cores_per_cluster);
  ps.num_cores = total_cores;
  return std::vector<PackageSizes>{ps};
}

// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].node = 0;  // no NUMA
  }
}

#endif  // HWY_OS_*

#if HWY_OS_WIN || HWY_OS_APPLE

void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
  // Assumes clusters are homogeneous. Otherwise, we would have to scan
  // `RelationCache` again and find the corresponding package_idx.
  const Cache* caches = DataCaches();
  const size_t private_kib = caches ? caches[2].size_kib : 0;
  const size_t shared_kib = caches ? caches[3].size_kib : 0;

  for (size_t ip = 0; ip < packages.size(); ++ip) {
    Topology::Package& p = packages[ip];
    for (size_t ic = 0; ic < p.clusters.size(); ++ic) {
      Topology::Cluster& c = p.clusters[ic];
      c.private_kib = private_kib;
      c.shared_kib = shared_kib;
    }
  }
}

#endif  // HWY_OS_WIN || HWY_OS_APPLE

}  // namespace

HWY_CONTRIB_DLLEXPORT Topology::Topology() {
#if HWY_OS_LINUX || HWY_OS_WIN || HWY_OS_APPLE
  lps.resize(TotalLogicalProcessors());
  const std::vector<PackageSizes>& package_sizes = DetectPackages(lps);
  if (package_sizes.empty()) return;
  SetNodes(lps);

  // Allocate per-package/cluster/core vectors. This indicates to callers that
  // detection succeeded.
  packages.resize(package_sizes.size());
  for (size_t p = 0; p < packages.size(); ++p) {
    packages[p].clusters.resize(package_sizes[p].num_clusters);
    packages[p].cores.resize(package_sizes[p].num_cores);
  }

  // Populate the per-cluster/core sets of LP.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    Package& p = packages[lps[lp].package];
    p.clusters[lps[lp].cluster].lps.Set(lp);
    p.cores[lps[lp].core].lps.Set(lp);
  }

  SetClusterCacheSizes(packages);
#endif  // HWY_OS_*
}
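
// Usage sketch (illustrative): an empty `packages` means detection failed.
//   hwy::Topology topology;
//   if (!topology.packages.empty()) {
//     for (const hwy::Topology::Package& pkg : topology.packages) {
//       for (const hwy::Topology::Cluster& cluster : pkg.clusters) {
//         cluster.lps.Foreach([](size_t lp) { /* per-LP work */ });
//       }
//     }
//   }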

// ------------------------------ Cache detection

namespace {

using Caches = std::array<Cache, 4>;

// We assume homogeneous caches across all clusters because some OS APIs return
// a single value for a class of CPUs.

#if HWY_OS_LINUX
std::string ReadString(const char* name, size_t index) {
  // First CPU is usually a P core.
  const std::string path("/sys/devices/system/cpu/cpu0/cache/index%zu/");
  char buf200[200];
  size_t end = ReadSysfs((path + name).c_str(), index, buf200);
  // Remove trailing newline/null to simplify string comparison.
  for (; end != 0; --end) {
    if (buf200[end - 1] != '\0' && buf200[end - 1] != '\n') break;
  }
  return std::string(buf200, buf200 + end);
}

// Despite its name, this *reads* the sysfs attribute selected by `name` and
// `index`, and writes the parsed number to `out`. Returns false on failure.
template <typename T>
bool WriteSysfs(const char* name, size_t index, T* out) {
  const std::string str = ReadString(name, index);
  // Do not call `ParseNumberWithOptionalSuffix` because it would expand the
  // K suffix of the sysfs `size` attribute to bytes, whereas we want KiB.
  size_t pos = 0;
  size_t val;
  if (!ParseDigits(str.c_str(), str.length(), pos, &val)) return false;
  HWY_ASSERT(pos <= str.length());
  *out = static_cast<T>(val);
  return true;
}

// Reading from sysfs is preferred because sysconf returns L3 associativity = 0
// on some CPUs, and does not indicate sharing across cores.
// https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
bool InitCachesSysfs(Caches& caches) {
  // For computing shared cache sizes.
  std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
  const std::vector<PackageSizes> package_sizes = DetectPackages(lps);
  // `package_sizes` is only used to check that `lps` were filled.
  if (package_sizes.empty()) {
    HWY_WARN("no packages, shared cache sizes may be incorrect\n");
    return false;
  }

  for (size_t i = 0;; ++i) {
    const std::string type = ReadString("type", i);
    if (type.empty()) break;  // done, no more entries
    if (type != "Data" && type != "Unified") continue;
    uint32_t level;
    if (!WriteSysfs("level", i, &level)) continue;
    if (level != 1 && level != 2 && level != 3) continue;
    Cache& c = caches[level];

    // Check before overwriting any fields.
    if (c.size_kib != 0) {
      HWY_WARN("ignoring another L%u, first size %u\n", level, c.size_kib);
      continue;
    }

    const bool ok = WriteSysfs("size", i, &c.size_kib) &&
                    WriteSysfs("ways_of_associativity", i, &c.associativity) &&
                    WriteSysfs("number_of_sets", i, &c.sets);
    if (HWY_UNLIKELY(!ok)) {
      HWY_WARN("skipping partially-detected L%u, error %d\n", level, errno);
      c = Cache();
      continue;
    }

    // Compute line size *before* adjusting the size for sharing. Note that
    // `coherency_line_size` exists, but we are not sure that is the line size.
    const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
    const size_t lines = c.associativity * c.sets;
    c.bytes_per_line = static_cast<uint16_t>(DivByFactor(bytes, lines));

    // Divide by number of *cores* sharing the cache.
    const std::string shared_str = ReadString("shared_cpu_list", i);
    if (HWY_UNLIKELY(shared_str.empty())) {
      HWY_WARN("no shared_cpu_list for L%u %s\n", level, type.c_str());
      c.cores_sharing = 1;
    } else {
      const std::vector<size_t> shared_lps =
          ExpandList(shared_str.c_str(), shared_str.length(), lps.size() - 1);
      size_t num_cores = 0;
      for (size_t lp : shared_lps) {
        if (HWY_LIKELY(lp < lps.size())) {
          num_cores += lps[lp].smt == 0;
        } else {
          HWY_WARN("out of bounds lp %zu of %zu from %s\n", lp, lps.size(),
                   shared_str.c_str());
        }
      }
      if (num_cores == 0) {
        HWY_WARN("no cores sharing L%u %s, setting to 1\n", level,
                 type.c_str());
        num_cores = 1;
      }
      c.cores_sharing = static_cast<uint16_t>(num_cores);
      // There exist CPUs for which L3 is not evenly divisible by `num_cores`,
      // hence do not use `DivByFactor`. It is safer to round down.
      c.size_kib = static_cast<uint32_t>(c.size_kib / num_cores);
      c.sets = static_cast<uint32_t>(c.sets / num_cores);
    }
  }

  // Require L1 and L2 cache.
  if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
// Don't complain on Android because this is known to happen there. We are
// unaware of good alternatives: `getauxval(AT_L1D_CACHEGEOMETRY)` and
// `sysconf(_SC_LEVEL1_DCACHE_SIZE)` are unreliable, detecting via timing seems
// difficult to do reliably, and we do not want to maintain lists of known CPUs
// and their properties. It's OK to return false; callers are responsible for
// assuming reasonable defaults.
#ifndef __ANDROID__
    HWY_WARN("sysfs detected L1=%u L2=%u, err %x\n", caches[1].size_kib,
             caches[2].size_kib, errno);
#endif
    return false;
  }

  // L3 is optional; if not found, its size is already zero from static init.
  return true;
}

#elif HWY_OS_WIN

bool InitCachesWin(Caches& caches) {
  std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
  const size_t max_lps_per_core = MaxLpsPerCore(lps);

  (void)ForEachSLPI(RelationCache, [max_lps_per_core,
                                    &caches](const SLPI& info) {
    const CACHE_RELATIONSHIP& cr = info.Cache;
    if (cr.Type != CacheUnified && cr.Type != CacheData) return;
    if (1 <= cr.Level && cr.Level <= 3) {
      Cache& c = caches[cr.Level];
      // If the size is non-zero then we (probably) have already detected this
      // cache and can skip the CR.
      if (c.size_kib > 0) return;
      c.size_kib = static_cast<uint32_t>(DivByFactor(cr.CacheSize, 1024));
      c.bytes_per_line = static_cast<uint16_t>(cr.LineSize);
      c.associativity = (cr.Associativity == CACHE_FULLY_ASSOCIATIVE)
                            ? Cache::kMaxAssociativity
                            : cr.Associativity;

      // How many cores share this cache?
      size_t shared_with = NumBits(cr.GroupCount, cr.GroupMasks);
      // Divide out hyperthreads. This core may have fewer than
      // `max_lps_per_core`, hence round up.
      shared_with = DivCeil(shared_with, max_lps_per_core);
      if (shared_with == 0) {
        HWY_WARN("no cores sharing L%u, setting to 1\n", cr.Level);
        shared_with = 1;
      }

      // Update `size_kib` to *per-core* portion.
      // There exist CPUs for which L3 is not evenly divisible by `shared_with`,
      // hence do not use `DivByFactor`. It is safer to round down.
      c.size_kib = static_cast<uint32_t>(c.size_kib / shared_with);
      c.cores_sharing = static_cast<uint16_t>(shared_with);
    }
  });

  // Require L1 and L2 cache.
  if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
    HWY_WARN("Windows detected L1=%u, L2=%u, err %lx\n", caches[1].size_kib,
             caches[2].size_kib, GetLastError());
    return false;
  }

  // L3 is optional; if not found, its size is already zero from static init.
  return true;
}

#elif HWY_OS_APPLE

bool InitCachesApple(Caches& caches) {
  int err = 0;
  Cache& L1 = caches[1];
  Cache& L2 = caches[2];
  Cache& L3 = caches[3];

  // Total L1 and L2 size can be reliably queried, but prefer perflevel0
  // (P-cores) because hw.l1dcachesize etc. are documented to describe the
  // "least performant core".
  bool ok = Sysctl("hw.perflevel0.l1dcachesize", 1024, err, &L1.size_kib) ||
            Sysctl("hw.l1dcachesize", 1024, err, &L1.size_kib);
  ok &= Sysctl("hw.perflevel0.l2cachesize", 1024, err, &L2.size_kib) ||
        Sysctl("hw.l2cachesize", 1024, err, &L2.size_kib);
  if (HWY_UNLIKELY(!ok)) {
    HWY_WARN("Apple cache detection failed, error %d\n", err);
    return false;
  }
  L1.cores_sharing = 1;
  if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) {
    // There exist CPUs for which L2 is not evenly divisible by `cores_sharing`,
    // hence do not use `DivByFactor`. It is safer to round down.
    L2.size_kib /= L2.cores_sharing;
  } else {
    L2.cores_sharing = 1;
  }

  // Other properties are not always reported. Set `associativity` and
  // `bytes_per_line` based on known models.
  char brand[128] = {0};
  size_t size = sizeof(brand);
  if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) {
    if (strncmp(brand, "Apple ", 6) != 0) {
      // Unexpected, but we still check the model suffix below.
      HWY_WARN("unexpected Apple brand %s\n", brand);
    }

    if (brand[6] == 'M') {
      // https://dougallj.github.io/applecpu/firestorm.html,
      // https://www.7-cpu.com/cpu/Apple_M1.html:
      L1.bytes_per_line = 64;
      L1.associativity = 8;
      L2.bytes_per_line = 128;
      if (brand[7] == '1') {  // M1
        L2.associativity = 12;
      } else if ('2' <= brand[7] && brand[7] <= '4') {  // M2/M3, maybe also M4
        L2.associativity = 16;
      } else {
        L2.associativity = 0;  // Unknown, set below via sysctl.
      }

      // Although Wikipedia lists SLC sizes per model, we do not know how it is
      // partitioned/allocated, so do not treat it as a reliable L3.
    }  // M*
  }  // brand string

  // This sysctl does not distinguish between L1 and L2 line sizes, so only use
  // it if we have not already set `bytes_per_line` above.
  uint16_t bytes_per_line;
  if (!Sysctl("hw.cachelinesize", 1, err, &bytes_per_line)) {
    bytes_per_line = static_cast<uint16_t>(HWY_ALIGNMENT);  // guess
  }
  for (size_t level = 1; level <= 3; ++level) {
    if (caches[level].bytes_per_line == 0) {
      caches[level].bytes_per_line = bytes_per_line;
    }
  }

  // Fill in associativity if not already set. Unfortunately this is only
  // reported on x86, not on M*.
  if (L1.associativity == 0 && !Sysctl("machdep.cpu.cache.L1_associativity", 1,
                                       err, &L1.associativity)) {
    L1.associativity = 8;  // guess
  }
  if (L2.associativity == 0 && !Sysctl("machdep.cpu.cache.L2_associativity", 1,
                                       err, &L2.associativity)) {
    L2.associativity = 12;  // guess
  }
  // There is no L3_associativity.
  if (L3.associativity == 0) {
    L3.associativity = 12;  // guess
  }

  // Now attempt to query L3. Although this sysctl is documented, M3 does not
  // report an L3 cache.
  if (L3.size_kib == 0 &&
      (Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) ||
       Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) {
    // There exist CPUs for which L3 is not evenly divisible by `cores_sharing`,
    // hence do not use `DivByFactor`. It is safer to round down.
    if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) {
      L3.size_kib /= L3.cores_sharing;
    } else {
      L3.cores_sharing = 1;
    }
  }
  // If no L3 cache, reset all fields for consistency.
  if (L3.size_kib == 0) {
    L3 = Cache();
  }

  // Are there other useful sysctls? hw.cacheconfig appears to be how many
  // cores share the memory and caches, though this is not documented, and
  // duplicates information in hw.perflevel0.cpusperl*.

  return true;
}

#endif  // HWY_OS_*

// Most APIs do not set the `sets` field, so compute it from the size and
// associativity; if a value is already set, ensure it matches to within one
// (rounding can introduce an off-by-one).
HWY_MAYBE_UNUSED void ComputeSets(Cache& c) {
  // If there is no such cache, avoid division by zero.
  if (HWY_UNLIKELY(c.size_kib == 0)) {
    c.sets = 0;
    return;
  }
  const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
  // `size_kib` may have been rounded down, hence `lines` and `sets` are not
  // necessarily evenly divisible, so round down instead of `DivByFactor`.
  const size_t lines = bytes / c.bytes_per_line;
  const size_t sets = lines / c.associativity;

  if (c.sets == 0) {
    c.sets = static_cast<uint32_t>(sets);
  } else {
    // Absolute difference, to avoid unsigned underflow when `sets > c.sets`.
    const size_t diff = (c.sets > sets) ? c.sets - sets : sets - c.sets;
    if (diff > 1) {
      HWY_ABORT("Inconsistent cache sets %u != %zu\n", c.sets, sets);
    }
  }
}
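
// Worked example (illustrative): a 32 KiB, 8-way cache with 64-byte lines has
// 32768 / 64 = 512 lines and 512 / 8 = 64 sets.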

const Cache* InitDataCaches() {
  alignas(64) static Caches caches;

  // On failure, return immediately because InitCaches*() already warn.
#if HWY_OS_LINUX
  if (HWY_UNLIKELY(!InitCachesSysfs(caches))) return nullptr;
#elif HWY_OS_WIN
  if (HWY_UNLIKELY(!InitCachesWin(caches))) return nullptr;
#elif HWY_OS_APPLE
  if (HWY_UNLIKELY(!InitCachesApple(caches))) return nullptr;
#else
  HWY_WARN("Cache detection not implemented for this platform.\n");
  (void)caches;
  return nullptr;
#define HWY_NO_CACHE_DETECTION
#endif

  // Prevents "code not reached" warnings on WASM.
#ifndef HWY_NO_CACHE_DETECTION
  for (size_t level = 1; level <= 3; ++level) {
    ComputeSets(caches[level]);
  }

  // Heuristic to ignore SLCs such as on Ampere Altra, which should not be
  // treated as a reliable L3 because of their cache inclusion policy.
  // On Apple M*, these are not even reported as an L3.
  if (caches[3].cores_sharing >= 16 && caches[3].size_kib <= 512) {
    caches[3] = Cache();
  }

  return &caches[0];
#endif  // HWY_NO_CACHE_DETECTION
}

}  // namespace

HWY_CONTRIB_DLLEXPORT const Cache* DataCaches() {
  static const Cache* caches = InitDataCaches();
  return caches;
}
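
// Usage sketch (illustrative): `DataCaches()` returns nullptr if detection
// failed, so callers should fall back to reasonable defaults:
//   const hwy::Cache* caches = hwy::DataCaches();
//   const size_t l1_kib = caches ? caches[1].size_kib : 32;            // guess
//   const size_t line_bytes = caches ? caches[1].bytes_per_line : 64;  // guess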

}  // namespace hwy