---
name: Benchmarks

# Triggers: pushes to `main` and pull requests that touch the benchmark
# sources, the library sources, the Cargo manifests or this workflow file,
# plus manual runs via `workflow_dispatch`.
on:
  push:
    branches: [main]
    paths:
      - "benches/**"
      - "src/**"
      - "Cargo.toml"
      - "Cargo.lock"
      - ".github/workflows/benchmark.yml"
  pull_request:
    paths:
      - "benches/**"
      - "src/**"
      - "Cargo.toml"
      - "Cargo.lock"
      - ".github/workflows/benchmark.yml"
  workflow_dispatch:
# Workflow-wide environment: always colourise cargo output and print
# backtraces on panic.
env:
  CARGO_TERM_COLOR: always
  RUST_BACKTRACE: "1"

# Read-only access to repository contents is enough: this workflow no
# longer comments on PRs. Reviewers download the Criterion artifacts
# manually from the workflow run page.
permissions:
  contents: read
jobs:
  # Runs the Criterion benches once per matrix row (platform x SIMD tier)
  # and uploads the raw output plus the HTML report as artifacts.
  benchmark:
    name: ${{ matrix.label }}
    strategy:
      matrix:
        include:
          # aarch64 NEON — runtime dispatcher picks NEON; scalar variant in
          # each bench exercised via `use_simd=false`.
          - os: macos-latest
            arch: aarch64
            tier: neon
            rustflags: ''
            label: macos-aarch64-neon
          # aarch64 with NEON short-circuited via `colconv_force_scalar`:
          # the dispatcher takes the scalar path on every call, producing
          # a scalar baseline comparable to the one measured inside the
          # `use_simd=false` bench variant, but with the dispatcher itself
          # routed to scalar as well.
          - os: macos-latest
            arch: aarch64
            tier: scalar
            rustflags: '--cfg colconv_force_scalar'
            label: macos-aarch64-scalar
          # x86_64 default — runtime dispatcher picks whichever x86 tier
          # the runner supports. Standard ubuntu-latest is AMD EPYC 7763
          # (Milan) which has AVX2 but NOT AVX-512, so this tier ends up
          # exercising the AVX2 kernel in practice. There is no AVX-512
          # bench row; see the note below.
          - os: ubuntu-latest
            arch: x86_64
            tier: default
            rustflags: ''
            label: ubuntu-x86_64-default
          # Note: no AVX-512 bench tier. GitHub-hosted free runners are
          # AMD Milan (no AVX-512), and emulated numbers from Intel SDE
          # are ~5-10x off real hardware — not worth measuring. Test
          # correctness of the AVX-512 kernel is covered by the
          # `test-sde` job in ci.yml instead.

          # x86_64 with AVX-512 disabled: forces the AVX2 dispatch branch
          # on runners that would otherwise always pick AVX-512. Gives
          # explicit AVX2-tier numbers regardless of runner CPU.
          - os: ubuntu-latest
            arch: x86_64
            tier: avx2-max
            rustflags: '--cfg colconv_disable_avx512'
            label: ubuntu-x86_64-avx2-max
          # x86_64 with AVX-512 and AVX2 both disabled: forces the SSE4.1
          # dispatch branch. Every x86_64 CPU since ~2008 has SSE4.1, so
          # this tier exercises the SSE4.1 kernel on every runner.
          - os: ubuntu-latest
            arch: x86_64
            tier: sse41-max
            rustflags: '--cfg colconv_disable_avx512 --cfg colconv_disable_avx2'
            label: ubuntu-x86_64-sse41-max
          # x86_64 with every SIMD backend short-circuited: scalar-only
          # baseline. Complements `use_simd=false` variants inside each
          # bench (this tier also routes the dispatcher itself to scalar).
          - os: ubuntu-latest
            arch: x86_64
            tier: scalar
            rustflags: '--cfg colconv_force_scalar'
            label: ubuntu-x86_64-scalar
          # x86_64 with `-C target-cpu=native`: enables the full feature
          # set of the runner's build-time CPU for LLVM auto-vectorization
          # of scalar paths and maximum codegen quality for SIMD kernels.
          #
          # `native` uses `target_rustflags` (routed via the per-target
          # `CARGO_TARGET_*_RUSTFLAGS` env var) instead of the global
          # `rustflags` field. The motivation recorded here is that a
          # global `RUSTFLAGS` also reached proc-macro dylibs built for
          # the host (`thiserror_impl`, etc.), which then SIGILLed when
          # rustc loaded them in a different execution context.
          #
          # NOTE(review): Cargo's documentation says rustflags are kept
          # away from host artifacts (proc macros, build scripts) only
          # when `--target` is passed. The bench command below does not
          # pass it — confirm the per-target variable really scopes the
          # flag the way this tier expects.
          - os: ubuntu-latest
            arch: x86_64
            tier: native
            rustflags: ''
            target_rustflags: '-C target-cpu=native'
            label: ubuntu-x86_64-native
          # Windows x86_64 — same dispatcher as Linux but validates the
          # MSVC toolchain handles the intrinsics-heavy modules.
          - os: windows-latest
            arch: x86_64
            tier: default
            rustflags: ''
            label: windows-x86_64-default
    runs-on: ${{ matrix.os }}
    env:
      # Global rustflags for the tier. Empty for tiers that add none;
      # the "Run benchmarks" step unsets it in that case so that it does
      # not shadow the per-target variable below.
      RUSTFLAGS: ${{ matrix.rustflags }}
      # Per-target rustflags, used by the `native` tier instead of
      # `RUSTFLAGS`. Empty for tiers that don't opt in.
      CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: ${{ matrix.target_rustflags || '' }}
    steps:
      - uses: actions/checkout@v6

      - name: Install Rust
        run: rustup update stable --no-self-update && rustup default stable

      # The three steps below log the runner's CPU so the numbers can be
      # attributed to actual hardware; each is best-effort.
      - name: Print CPU info (Linux)
        if: runner.os == 'Linux'
        shell: bash
        run: |
          echo "=== /proc/cpuinfo (first flags line) ==="
          grep -m1 '^flags' /proc/cpuinfo || true
          echo "=== lscpu ==="
          lscpu || true

      - name: Print CPU info (macOS)
        if: runner.os == 'macOS'
        shell: bash
        run: |
          echo "=== sysctl machdep.cpu ==="
          sysctl machdep.cpu || true
          echo "=== uname -m ==="
          uname -m

      - name: Print CPU info (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          Get-CimInstance Win32_Processor | Select-Object Name, Manufacturer, NumberOfCores, NumberOfLogicalProcessors | Format-List

      - name: Cache cargo build and registry
        uses: actions/cache@v5
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-bench-${{ matrix.tier }}-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-bench-${{ matrix.tier }}-
            ${{ runner.os }}-bench-

      - name: Run benchmarks
        shell: bash
        # `--benches` limits cargo to the registered bench targets.
        # Without it, `cargo bench` also runs the library's `#[test]`
        # harness in release mode. Uses Criterion's native output
        # format (per-bench `time:` / `thrpt:` blocks with confidence
        # intervals) — the full HTML report lives in `target/criterion/`
        # and is uploaded separately below.
        run: |
          # Cargo consults the RUSTFLAGS environment variable before any
          # per-target rustflags and uses only the first source it finds,
          # so an exported-but-empty RUSTFLAGS can shadow
          # CARGO_TARGET_*_RUSTFLAGS. Drop it when the tier sets none.
          if [ -z "${RUSTFLAGS:-}" ]; then
            unset RUSTFLAGS
          fi
          cargo bench --benches | tee "benchmark-all-${{ matrix.label }}.txt"
        continue-on-error: false

      - name: Write run metadata
        shell: bash
        # Small metadata file alongside the raw Criterion output so
        # downloaded archives are self-describing (which runner / tier /
        # flags produced these numbers). Full per-bench results live in
        # `benchmark-all-${LABEL}.txt` and the HTML report under
        # `target/criterion/` — both uploaded below.
        run: |
          meta="benchmark-metadata-${{ matrix.label }}.md"
          echo "# Benchmark metadata: ${{ matrix.label }}" > "$meta"
          echo "" >> "$meta"
          echo "- OS: ${{ matrix.os }}" >> "$meta"
          echo "- Arch: ${{ matrix.arch }}" >> "$meta"
          echo "- SIMD tier: ${{ matrix.tier }}" >> "$meta"
          echo "- Runner: ${{ runner.name }}" >> "$meta"
          echo "- Runner arch (GH): ${{ runner.arch }}" >> "$meta"
          echo "- RUSTFLAGS: \`${{ matrix.rustflags }}\`" >> "$meta"
          if [ -n "${{ matrix.target_rustflags || '' }}" ]; then
            echo "- CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: \`${{ matrix.target_rustflags || '' }}\`" >> "$meta"
          fi
          echo "- Date: $(date -u +"%Y-%m-%d %H:%M:%S UTC")" >> "$meta"
          cat "$meta"

      - name: Create benchmark archive
        shell: bash
        # Collects the text output, the metadata file and a copy of the
        # Criterion report into `benchmark-results/`. Every move/copy is
        # best-effort so a missing piece does not fail the step.
        run: |
          mkdir -p benchmark-results
          mv benchmark-*.txt benchmark-results/ 2>/dev/null || true
          mv "benchmark-metadata-${{ matrix.label }}.md" benchmark-results/ 2>/dev/null || true
          if [ -d "target/criterion" ]; then
            cp -r target/criterion "benchmark-results/criterion-${{ matrix.label }}" || true
          fi

      - name: Upload benchmark results
        uses: actions/upload-artifact@v7
        with:
          name: benchmark-results-${{ matrix.label }}
          path: benchmark-results/
          retention-days: 90

      # Uploaded even when an earlier step failed, so whatever Criterion
      # wrote before the failure is still retrievable.
      - name: Upload Criterion detailed results
        uses: actions/upload-artifact@v7
        if: always()
        with:
          name: criterion-detailed-${{ matrix.label }}
          path: target/criterion/
          retention-days: 90
        continue-on-error: false

  # Aggregate results from all platforms and SIMD tiers. Runs even when
  # some matrix rows failed so the rows that did finish are still
  # collected.
  aggregate-results:
    name: Aggregate benchmark results
    needs: benchmark
    runs-on: ubuntu-latest
    if: always()
    steps:
      - name: Download all benchmark results
        uses: actions/download-artifact@v8
        with:
          path: all-results

      - name: Write combined index
        shell: bash
        # Small top-level index listing the matrix entries and their
        # metadata so the combined archive is self-describing. Full
        # Criterion output (txt + HTML report) lives under
        # `all-results/` per-matrix subdirectories.
        run: |
          index="BENCHMARK_INDEX.md"
          echo "# Benchmark run: $(date -u +"%Y-%m-%d %H:%M:%S UTC")" > "$index"
          echo "" >> "$index"
          echo "Raw Criterion output per matrix entry is under" >> "$index"
          echo "\`all-results/benchmark-results-<label>/\` (txt + HTML report)." >> "$index"
          echo "" >> "$index"
          for meta in all-results/benchmark-results-*/benchmark-metadata-*.md; do
            if [ -f "$meta" ]; then
              echo "" >> "$index"
              cat "$meta" >> "$index"
              echo "" >> "$index"
              echo "---" >> "$index"
            fi
          done
          cat "$index"

      - name: Upload combined results
        uses: actions/upload-artifact@v7
        with:
          name: benchmark-results-combined
          path: |
            BENCHMARK_INDEX.md
            all-results/
          retention-days: 90