1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# Runs `cargo bench --benches` once per OS / SIMD-tier combination listed in
# the matrix below and uploads the console output, a short system-info file
# and `target/criterion/` as build artifacts.
name: Benchmarks

# Triggers: pushes to `main` and pull requests that touch the benches, the
# sources, the Cargo manifests or the benchmark workflow file, plus manual
# dispatch.
on:
  push:
    branches:
      - main
    paths:
      - 'benches/**'
      - 'src/**'
      - 'Cargo.toml'
      - 'Cargo.lock'
      - '.github/workflows/benchmark.yml'
  pull_request:
    paths:
      - 'benches/**'
      - 'src/**'
      - 'Cargo.toml'
      - 'Cargo.lock'
      - '.github/workflows/benchmark.yml'
  workflow_dispatch:

env:
  CARGO_TERM_COLOR: always
  RUST_BACKTRACE: 1

# The job only checks out code, restores/saves a cache and uploads artifacts,
# so a read-only token is sufficient.
permissions:
  contents: read

jobs:
  benchmark:
    name: ${{ matrix.label }}
    strategy:
      # Let every tier finish even if one of them fails.
      fail-fast: false
      matrix:
        # Fields per entry:
        #   os               runner image
        #   arch / tier      recorded in the system-info file; `tier` is also
        #                    part of the cache key
        #   rustflags        exported as the global `RUSTFLAGS`
        #   target_rustflags (optional) exported as the per-target
        #                    `CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS`
        #   label            job name and artifact/file-name suffix
        include:
          # aarch64 NEON — runtime dispatcher picks NEON. Every
          # `skip::skip_*` and `skip_class!`-generated fn has a hand-tuned
          # NEON code path exercised here.
          - os: macos-latest
            arch: aarch64
            tier: neon
            rustflags: ''
            label: macos-aarch64-neon

          # aarch64 with NEON forced off via `memspan_force_scalar`: the
          # dispatcher takes the scalar fallback on every call. Gives a
          # like-for-like scalar baseline on the same hardware as the NEON
          # tier above, so the NEON win is measurable without cross-runner
          # noise.
          - os: macos-latest
            arch: aarch64
            tier: scalar
            rustflags: '--cfg memspan_force_scalar'
            label: macos-aarch64-scalar

          # x86_64 default — runtime dispatcher picks whichever x86 tier
          # the runner supports. Standard ubuntu-latest is AMD EPYC 7763
          # (Milan): AVX2 yes, AVX-512 no. This tier exercises the AVX2
          # kernel in practice.
          - os: ubuntu-latest
            arch: x86_64
            tier: default
            rustflags: ''
            label: ubuntu-x86_64-default

          # Note (applies to the matrix as a whole, not to the entry below):
          # there is no AVX-512 bench tier. GitHub-hosted free runners are
          # AMD Milan (no AVX-512), and emulated numbers from Intel SDE
          # are ~5-10× off real hardware — not worth measuring. AVX-512
          # correctness is covered by the `test-sde` job in ci.yml.

          # x86_64 with AVX-512 disabled: forces the AVX2 dispatch branch
          # on runners that would otherwise pick AVX-512. Gives explicit
          # AVX2-tier numbers regardless of runner CPU.
          - os: ubuntu-latest
            arch: x86_64
            tier: avx2-max
            rustflags: '--cfg memspan_disable_avx512'
            label: ubuntu-x86_64-avx2-max

          # x86_64 with AVX-512 and AVX2 both disabled: forces the SSE4.2
          # dispatch branch. Every x86_64 CPU since ~2008 has SSE4.2, so
          # this tier exercises the SSE4.2 kernel on every runner.
          - os: ubuntu-latest
            arch: x86_64
            tier: sse42-max
            rustflags: '--cfg memspan_disable_avx512 --cfg memspan_disable_avx2'
            label: ubuntu-x86_64-sse42-max

          # x86_64 with every SIMD backend short-circuited: scalar-only
          # baseline. Pairs with `ubuntu-x86_64-default` to measure the
          # SIMD win on Linux/x86_64.
          - os: ubuntu-latest
            arch: x86_64
            tier: scalar
            rustflags: '--cfg memspan_force_scalar'
            label: ubuntu-x86_64-scalar

          # x86_64 with `-C target-cpu=native`: enables the full feature
          # set of the runner's build-time CPU for LLVM auto-vectorization
          # of scalar paths and maximum codegen quality for SIMD kernels.
          #
          # `native` uses `target_rustflags` (routed via the per-target
          # `CARGO_TARGET_*_RUSTFLAGS` env var) instead of the global
          # `rustflags` field, and the "Run benchmarks" step adds an
          # explicit `--target` whenever that variable is non-empty. Cargo
          # keeps extra flags away from host artifacts (proc macros, build
          # scripts) only when `--target` is given; without it the flags
          # reach every rustc invocation, and proc-macro dylibs codegen'd
          # with host-CPU-specific instructions can SIGILL when rustc
          # loads them in a different execution context.
          - os: ubuntu-latest
            arch: x86_64
            tier: native
            rustflags: ''
            target_rustflags: '-C target-cpu=native'
            label: ubuntu-x86_64-native

          # Windows x86_64 — same dispatcher as Linux but validates the
          # MSVC toolchain handles the intrinsics-heavy modules.
          - os: windows-latest
            arch: x86_64
            tier: default
            rustflags: ''
            label: windows-x86_64-default
    runs-on: ${{ matrix.os }}
    env:
      # Global flags from the matrix. Cargo takes its extra flags from the
      # first source that is set and checks `RUSTFLAGS` before any
      # `target.<triple>.rustflags` source, so a set-but-empty `RUSTFLAGS`
      # would shadow the per-target variable below. The "Run benchmarks"
      # step therefore unsets it again when it is empty.
      RUSTFLAGS: ${{ matrix.rustflags }}
      # Per-target rustflags for the Linux x86_64 triple. Only the `native`
      # tier sets this; combined with the explicit `--target` added in the
      # "Run benchmarks" step it affects crates built for the target and
      # not build-dependencies (proc macros, build scripts).
      # Empty for tiers that don't opt in — cargo treats empty as unset.
      CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: ${{ matrix.target_rustflags || '' }}
    steps:
      - uses: actions/checkout@v6

      - name: Install Rust
        run: rustup update stable --no-self-update && rustup default stable

      # The three "Print CPU info" steps are diagnostic only: they log which
      # CPU the runner has so benchmark numbers can be interpreted later.
      # `|| true` keeps a missing tool from failing the job.
      - name: Print CPU info (Linux)
        if: runner.os == 'Linux'
        shell: bash
        run: |
          echo "=== /proc/cpuinfo (first flags line) ==="
          grep -m1 '^flags' /proc/cpuinfo || true
          echo "=== lscpu ==="
          lscpu || true

      - name: Print CPU info (macOS)
        if: runner.os == 'macOS'
        shell: bash
        run: |
          echo "=== sysctl machdep.cpu ==="
          sysctl machdep.cpu || true
          echo "=== uname -m ==="
          uname -m

      - name: Print CPU info (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          Get-CimInstance Win32_Processor | Select-Object Name, Manufacturer, NumberOfCores, NumberOfLogicalProcessors | Format-List

      # Skipped for the `native` tier: artifacts built with
      # `-C target-cpu=native` are specific to the CPU of the runner that
      # built them, and the cache key carries no CPU information, so a
      # restored `target/` could contain code the current runner cannot
      # execute. That tier always builds from scratch.
      - name: Cache cargo build and registry
        if: matrix.tier != 'native'
        uses: actions/cache@v5
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-bench-${{ matrix.tier }}-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-bench-${{ matrix.tier }}-
            ${{ runner.os }}-bench-

      # `shell: bash` runs the script with `-eo pipefail`, so a failing
      # `cargo bench` fails the step even though its output is piped to tee.
      - name: Run benchmarks
        shell: bash
        run: |
          # An empty RUSTFLAGS still counts as "set" for Cargo and would
          # override the per-target flags; remove it so tiers without global
          # flags fall through to CARGO_TARGET_*_RUSTFLAGS.
          if [ -z "${RUSTFLAGS:-}" ]; then
            unset RUSTFLAGS
          fi

          # When per-target flags are in use, name the target explicitly so
          # Cargo applies them to target artifacts only and builds proc
          # macros / build scripts for the host without them.
          bench_args=(--benches)
          if [ -n "${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS:-}" ]; then
            bench_args+=(--target x86_64-unknown-linux-gnu)
          fi

          cargo bench "${bench_args[@]}" -- --quick --output-format criterion \
            | tee "benchmark-all-${{ matrix.label }}.txt"
        continue-on-error: false

      # Writes a small Markdown file describing the matrix entry and runner
      # that produced the numbers; uploaded next to the benchmark output.
      - name: Collect system info
        shell: bash
        run: |
          info="benchmark-info-${{ matrix.label }}.md"
          echo "## ${{ matrix.label }}" > "$info"
          echo "- OS: ${{ matrix.os }}" >> "$info"
          echo "- Arch: ${{ matrix.arch }}" >> "$info"
          echo "- SIMD tier: ${{ matrix.tier }}" >> "$info"
          echo "- Runner: ${{ runner.name }}" >> "$info"
          echo "- Runner arch (GH): ${{ runner.arch }}" >> "$info"
          echo "- RUSTFLAGS: \`${{ matrix.rustflags }}\`" >> "$info"
          if [ -n "${{ matrix.target_rustflags || '' }}" ]; then
            echo "- CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: \`${{ matrix.target_rustflags || '' }}\`" >> "$info"
          fi
          echo "- Date: $(date -u +"%Y-%m-%d %H:%M:%S UTC")" >> "$info"

      - name: Upload benchmark results
        uses: actions/upload-artifact@v7
        with:
          name: benchmark-results-${{ matrix.label }}
          path: |
            benchmark-all-${{ matrix.label }}.txt
            benchmark-info-${{ matrix.label }}.md
            target/criterion/
          retention-days: 90